diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 79f0652e192f2a..eaa4b3b2f82456 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -16,8 +16,11 @@ env:
jobs:
benchmark:
name: Benchmark
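+    # Run this job once per runner group listed in the matrix below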
+ strategy:
+ matrix:
+ group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus]
runs-on:
- group: aws-g5-4xlarge-cache
+ group: ${{ matrix.group }}
if: |
(github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
(github.event_name == 'push' && github.ref == 'refs/heads/main')
@@ -63,6 +66,9 @@ jobs:
python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+ # Enable this to see debug logs
+ # HF_HUB_VERBOSITY: debug
+ # TRANSFORMERS_VERBOSITY: debug
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
PGUSER: transformers_benchmarks
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json
index 2375663ffbc6db..3d579f7b368711 100644
--- a/benchmark/grafana_dashboard.json
+++ b/benchmark/grafana_dashboard.json
@@ -39,7 +39,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -77,7 +77,7 @@
"properties": [
{
"id": "custom.width",
- "value": 364
+ "value": 196
}
]
},
@@ -101,7 +101,7 @@
"properties": [
{
"id": "custom.width",
- "value": 708
+ "value": 581
}
]
},
@@ -113,7 +113,7 @@
"properties": [
{
"id": "custom.width",
- "value": 388
+ "value": 379
}
]
}
@@ -148,7 +148,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = '${branch}';",
+ "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -232,7 +232,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -312,7 +312,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -334,6 +334,19 @@
}
],
"title": "First eager forward pass",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -341,7 +354,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -424,7 +437,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -446,6 +459,19 @@
}
],
"title": "Second eager forward pass",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -466,7 +492,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -545,7 +571,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -567,6 +593,19 @@
}
],
"title": "Time to first token",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -574,7 +613,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -653,7 +692,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -675,6 +714,19 @@
}
],
"title": "Time to second token",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -682,7 +734,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -761,7 +813,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -783,6 +835,19 @@
}
],
"title": "Time to third token",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -790,7 +855,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -869,7 +934,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -891,6 +956,19 @@
}
],
"title": "Time to subsequent next tokens mean",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -911,7 +989,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -990,7 +1068,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'",
+ "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -1012,6 +1090,19 @@
}
],
"title": "First compile generate",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -1019,7 +1110,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -1098,7 +1189,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';",
+ "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -1120,6 +1211,19 @@
}
],
"title": "Second compile generate",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -1127,7 +1231,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -1206,7 +1310,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';",
+ "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -1228,6 +1332,19 @@
}
],
"title": "Third compile generate",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
@@ -1235,7 +1352,7 @@
"datasource": {
"default": true,
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"fieldConfig": {
"defaults": {
@@ -1314,7 +1431,7 @@
"editorMode": "code",
"format": "table",
"rawQuery": true,
- "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';",
+ "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
"refId": "A",
"sql": {
"columns": [
@@ -1336,11 +1453,24 @@
}
],
"title": "Fourth compile generate",
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "time"
+ }
+ ]
+ }
+ }
+ ],
"transparent": true,
"type": "barchart"
},
{
- "collapsed": false,
+ "collapsed": true,
"gridPos": {
"h": 1,
"w": 24,
@@ -1348,751 +1478,753 @@
"y": 64
},
"id": 15,
- "panels": [],
- "title": "Usage metrics",
- "type": "row"
- },
- {
- "datasource": {
- "default": true,
- "type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
+ "panels": [
+ {
+ "datasource": {
+ "default": true,
+ "type": "grafana-postgresql-datasource",
+ "uid": "be28nkzirtb0gd"
},
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": 60000,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": 60000,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
},
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 65
+ },
+ "id": 1,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
},
- "thresholdsStyle": {
- "mode": "off"
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
}
},
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
+ "targets": [
+ {
+ "datasource": {
+ "type": "grafana-postgresql-datasource",
+ "uid": "bdz2yss7sxo1sc"
},
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 9,
- "w": 12,
- "x": 0,
- "y": 65
- },
- "id": 1,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
+ "editorMode": "code",
+ "format": "table",
+ "rawQuery": true,
+ "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';",
+ "refId": "A",
+ "sql": {
+ "columns": [
+ {
+ "parameters": [
+ {
+ "name": "cpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "\"time\"",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ }
+ ],
+ "groupBy": [
+ {
+ "property": {
+ "type": "string"
+ },
+ "type": "groupBy"
+ }
+ ],
+ "limit": 50,
+ "whereJsonTree": {
+ "children1": [
+ {
+ "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
+ "properties": {
+ "field": "commit_id",
+ "fieldSrc": "field",
+ "operator": "equal",
+ "value": [
+ "${commit}"
+ ],
+ "valueError": [
+ null
+ ],
+ "valueSrc": [
+ "value"
+ ],
+ "valueType": [
+ "text"
+ ]
+ },
+ "type": "rule"
+ }
+ ],
+ "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
+ "type": "group"
+ },
+ "whereString": "commit_id = '${commit}'"
+ },
+ "table": "measurements"
+ }
+ ],
+ "title": "CPU Utilization",
+ "transparent": true,
+ "type": "timeseries"
},
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
{
"datasource": {
+ "default": true,
"type": "grafana-postgresql-datasource",
- "uid": "bdz2yss7sxo1sc"
+ "uid": "be28nkzirtb0gd"
},
- "editorMode": "code",
- "format": "table",
- "rawQuery": true,
- "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'",
- "refId": "A",
- "sql": {
- "columns": [
- {
- "parameters": [
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": 60000,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
{
- "name": "cpu_util",
- "type": "functionParameter"
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
}
- ],
- "type": "function"
+ ]
},
- {
- "parameters": [
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 65
+ },
+ "id": 4,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "grafana-postgresql-datasource",
+ "uid": "bdz2yss7sxo1sc"
+ },
+ "editorMode": "code",
+ "format": "table",
+ "rawQuery": true,
+ "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';",
+ "refId": "A",
+ "sql": {
+ "columns": [
{
- "name": "mem_megabytes",
- "type": "functionParameter"
+ "parameters": [
+ {
+ "name": "cpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "\"time\"",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
}
],
- "type": "function"
- },
- {
- "parameters": [
+ "groupBy": [
{
- "name": "gpu_util",
- "type": "functionParameter"
+ "property": {
+ "type": "string"
+ },
+ "type": "groupBy"
}
],
- "type": "function"
+ "limit": 50,
+ "whereJsonTree": {
+ "children1": [
+ {
+ "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
+ "properties": {
+ "field": "commit_id",
+ "fieldSrc": "field",
+ "operator": "equal",
+ "value": [
+ "${commit}"
+ ],
+ "valueError": [
+ null
+ ],
+ "valueSrc": [
+ "value"
+ ],
+ "valueType": [
+ "text"
+ ]
+ },
+ "type": "rule"
+ }
+ ],
+ "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
+ "type": "group"
+ },
+ "whereString": "commit_id = '${commit}'"
},
- {
- "parameters": [
+ "table": "measurements"
+ }
+ ],
+ "title": "GPU Utilization",
+ "transparent": true,
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "default": true,
+ "type": "grafana-postgresql-datasource",
+ "uid": "be28nkzirtb0gd"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": 60000,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
{
- "name": "gpu_mem_megabytes",
- "type": "functionParameter"
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
}
- ],
- "type": "function"
+ ]
},
- {
- "parameters": [
+ "unit": "decmbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 74
+ },
+ "id": 2,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "grafana-postgresql-datasource",
+ "uid": "bdz2yss7sxo1sc"
+ },
+ "editorMode": "code",
+ "format": "table",
+ "rawQuery": true,
+ "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}';",
+ "refId": "A",
+ "sql": {
+ "columns": [
{
- "name": "\"time\"",
- "type": "functionParameter"
+ "parameters": [
+ {
+ "name": "cpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "gpu_mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "\"time\"",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
}
],
- "type": "function"
- }
- ],
- "groupBy": [
- {
- "property": {
- "type": "string"
- },
- "type": "groupBy"
- }
- ],
- "limit": 50,
- "whereJsonTree": {
- "children1": [
- {
- "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
- "properties": {
- "field": "commit_id",
- "fieldSrc": "field",
- "operator": "equal",
- "value": [
- "${commit}"
- ],
- "valueError": [
- null
- ],
- "valueSrc": [
- "value"
- ],
- "valueType": [
- "text"
- ]
- },
- "type": "rule"
- }
- ],
- "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
- "type": "group"
- },
- "whereString": "commit_id = '${commit}'"
- },
- "table": "measurements"
- }
- ],
- "title": "CPU Utilization",
- "transparent": true,
- "type": "timeseries"
- },
- {
- "datasource": {
- "default": true,
- "type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": 60000,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 9,
- "w": 12,
- "x": 12,
- "y": 65
- },
- "id": 4,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "grafana-postgresql-datasource",
- "uid": "bdz2yss7sxo1sc"
- },
- "editorMode": "code",
- "format": "table",
- "rawQuery": true,
- "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'",
- "refId": "A",
- "sql": {
- "columns": [
- {
- "parameters": [
- {
- "name": "cpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
+ "groupBy": [
{
- "name": "mem_megabytes",
- "type": "functionParameter"
+ "property": {
+ "type": "string"
+ },
+ "type": "groupBy"
}
],
- "type": "function"
- },
- {
- "parameters": [
- {
- "name": "gpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
- {
- "name": "gpu_mem_megabytes",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
- {
- "name": "\"time\"",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- }
- ],
- "groupBy": [
- {
- "property": {
- "type": "string"
+ "limit": 50,
+ "whereJsonTree": {
+ "children1": [
+ {
+ "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
+ "properties": {
+ "field": "commit_id",
+ "fieldSrc": "field",
+ "operator": "equal",
+ "value": [
+ "${commit}"
+ ],
+ "valueError": [
+ null
+ ],
+ "valueSrc": [
+ "value"
+ ],
+ "valueType": [
+ "text"
+ ]
+ },
+ "type": "rule"
+ }
+ ],
+ "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
+ "type": "group"
},
- "type": "groupBy"
- }
- ],
- "limit": 50,
- "whereJsonTree": {
- "children1": [
- {
- "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
- "properties": {
- "field": "commit_id",
- "fieldSrc": "field",
- "operator": "equal",
- "value": [
- "${commit}"
- ],
- "valueError": [
- null
- ],
- "valueSrc": [
- "value"
- ],
- "valueType": [
- "text"
- ]
- },
- "type": "rule"
- }
- ],
- "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
- "type": "group"
- },
- "whereString": "commit_id = '${commit}'"
- },
- "table": "measurements"
- }
- ],
- "title": "GPU Utilization",
- "transparent": true,
- "type": "timeseries"
- },
- {
- "datasource": {
- "default": true,
- "type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": 60000,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
+ "whereString": "commit_id = '${commit}'"
},
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "decmbytes"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 9,
- "w": 12,
- "x": 0,
- "y": 74
- },
- "id": 2,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
+ "table": "measurements"
+ }
+ ],
+ "title": "Memory usage",
+ "transparent": true,
+ "type": "timeseries"
},
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
{
"datasource": {
+ "default": true,
"type": "grafana-postgresql-datasource",
- "uid": "bdz2yss7sxo1sc"
+ "uid": "be28nkzirtb0gd"
},
- "editorMode": "code",
- "format": "table",
- "rawQuery": true,
- "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}'",
- "refId": "A",
- "sql": {
- "columns": [
- {
- "parameters": [
- {
- "name": "cpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
},
- {
- "parameters": [
- {
- "name": "mem_megabytes",
- "type": "functionParameter"
- }
- ],
- "type": "function"
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": 60000,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
},
- {
- "parameters": [
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
{
- "name": "gpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
+ "color": "green",
+ "value": null
+ },
{
- "name": "gpu_mem_megabytes",
- "type": "functionParameter"
+ "color": "red",
+ "value": 80
}
- ],
- "type": "function"
+ ]
},
- {
- "parameters": [
- {
- "name": "\"time\"",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- }
- ],
- "groupBy": [
- {
- "property": {
- "type": "string"
- },
- "type": "groupBy"
- }
- ],
- "limit": 50,
- "whereJsonTree": {
- "children1": [
- {
- "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
- "properties": {
- "field": "commit_id",
- "fieldSrc": "field",
- "operator": "equal",
- "value": [
- "${commit}"
- ],
- "valueError": [
- null
- ],
- "valueSrc": [
- "value"
- ],
- "valueType": [
- "text"
- ]
- },
- "type": "rule"
- }
- ],
- "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
- "type": "group"
- },
- "whereString": "commit_id = '${commit}'"
- },
- "table": "measurements"
- }
- ],
- "title": "Memory usage",
- "transparent": true,
- "type": "timeseries"
- },
- {
- "datasource": {
- "default": true,
- "type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": 60000,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
+ "unit": "decmbytes"
},
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 74
+ },
+ "id": 3,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
},
- "thresholdsStyle": {
- "mode": "off"
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
}
},
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
+ "targets": [
+ {
+ "datasource": {
+ "type": "grafana-postgresql-datasource",
+ "uid": "bdz2yss7sxo1sc"
},
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "decmbytes"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 9,
- "w": 12,
- "x": 12,
- "y": 74
- },
- "id": 3,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "grafana-postgresql-datasource",
- "uid": "bdz2yss7sxo1sc"
- },
- "editorMode": "code",
- "format": "table",
- "rawQuery": true,
- "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'",
- "refId": "A",
- "sql": {
- "columns": [
- {
- "parameters": [
+ "editorMode": "code",
+ "format": "table",
+ "rawQuery": true,
+ "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';",
+ "refId": "A",
+ "sql": {
+ "columns": [
{
- "name": "cpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
+ "parameters": [
+ {
+ "name": "cpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
{
- "name": "mem_megabytes",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
+ "parameters": [
+ {
+ "name": "mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
{
- "name": "gpu_util",
- "type": "functionParameter"
- }
- ],
- "type": "function"
- },
- {
- "parameters": [
+ "parameters": [
+ {
+ "name": "gpu_util",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
{
- "name": "gpu_mem_megabytes",
- "type": "functionParameter"
+ "parameters": [
+ {
+ "name": "gpu_mem_megabytes",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
+ },
+ {
+ "parameters": [
+ {
+ "name": "\"time\"",
+ "type": "functionParameter"
+ }
+ ],
+ "type": "function"
}
],
- "type": "function"
- },
- {
- "parameters": [
+ "groupBy": [
{
- "name": "\"time\"",
- "type": "functionParameter"
+ "property": {
+ "type": "string"
+ },
+ "type": "groupBy"
}
],
- "type": "function"
- }
- ],
- "groupBy": [
- {
- "property": {
- "type": "string"
+ "limit": 50,
+ "whereJsonTree": {
+ "children1": [
+ {
+ "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
+ "properties": {
+ "field": "commit_id",
+ "fieldSrc": "field",
+ "operator": "equal",
+ "value": [
+ "${commit}"
+ ],
+ "valueError": [
+ null
+ ],
+ "valueSrc": [
+ "value"
+ ],
+ "valueType": [
+ "text"
+ ]
+ },
+ "type": "rule"
+ }
+ ],
+ "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
+ "type": "group"
},
- "type": "groupBy"
- }
- ],
- "limit": 50,
- "whereJsonTree": {
- "children1": [
- {
- "id": "baa888b8-89ab-4cde-b012-31922f8671e9",
- "properties": {
- "field": "commit_id",
- "fieldSrc": "field",
- "operator": "equal",
- "value": [
- "${commit}"
- ],
- "valueError": [
- null
- ],
- "valueSrc": [
- "value"
- ],
- "valueType": [
- "text"
- ]
- },
- "type": "rule"
- }
- ],
- "id": "bab88a98-0123-4456-b89a-b1922f7d4f11",
- "type": "group"
- },
- "whereString": "commit_id = '${commit}'"
- },
- "table": "measurements"
+ "whereString": "commit_id = '${commit}'"
+ },
+ "table": "measurements"
+ }
+ ],
+ "title": "GPU memory usage",
+ "transparent": true,
+ "type": "timeseries"
}
],
- "title": "GPU memory usage",
- "transparent": true,
- "type": "timeseries"
+ "title": "Usage metrics",
+ "type": "row"
}
],
+ "refresh": "",
"schemaVersion": 39,
"tags": [],
"templating": {
@@ -2105,7 +2237,7 @@
},
"datasource": {
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"definition": "SELECT DISTINCT branch FROM benchmarks;",
"description": "",
@@ -2125,12 +2257,12 @@
{
"current": {
"selected": false,
- "text": "1728662868776",
- "value": "1728662868776"
+ "text": "1729701492845",
+ "value": "1729701492845"
},
"datasource": {
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;",
"description": "",
@@ -2149,12 +2281,12 @@
{
"current": {
"selected": false,
- "text": "1728663254125",
- "value": "1728663254125"
+ "text": "1730120430069",
+ "value": "1730120430069"
},
"datasource": {
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;",
"description": "",
@@ -2164,7 +2296,7 @@
"name": "EndTime",
"options": [],
"query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;",
- "refresh": 2,
+ "refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
@@ -2178,7 +2310,7 @@
},
"datasource": {
"type": "grafana-postgresql-datasource",
- "uid": "de0dbhs18ho1sc"
+ "uid": "be28nkzirtb0gd"
},
"definition": "SELECT DISTINCT gpu_name FROM benchmarks;",
"hide": 0,
@@ -2188,11 +2320,32 @@
"name": "gpu_name",
"options": [],
"query": "SELECT DISTINCT gpu_name FROM benchmarks;",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
+ },
+ {
+ "current": {
+ "selected": false,
+ "text": "10",
+ "value": "10"
+ },
+ "description": "The number of commits to display, going from most recent to the nth commit.",
+ "hide": 0,
+ "label": "Last # of commits",
+ "name": "last_n_commits",
+ "options": [
+ {
+ "selected": true,
+ "text": "10",
+ "value": "10"
+ }
+ ],
+ "query": "10",
+ "skipUrlSync": false,
+ "type": "textbox"
}
]
},
@@ -2206,6 +2359,6 @@
"timezone": "browser",
"title": "Transformers benchmarks",
"uid": "fdz33iyzln9c0a",
- "version": 11,
+ "version": 4,
"weekStart": ""
}
diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql
index 4381b99cea660e..573cc11518e857 100644
--- a/benchmark/init_db.sql
+++ b/benchmark/init_db.sql
@@ -7,6 +7,10 @@ CREATE TABLE IF NOT EXISTS benchmarks (
created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);
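+-- Index the columns the dashboard queries filter and join on (benchmark_id, branch)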
+CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id);
+
+CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch);
+
CREATE TABLE IF NOT EXISTS device_measurements (
measurement_id SERIAL PRIMARY KEY,
benchmark_id int REFERENCES benchmarks (benchmark_id),
@@ -17,6 +21,8 @@ CREATE TABLE IF NOT EXISTS device_measurements (
time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);
+CREATE INDEX IF NOT EXISTS device_measurements_benchmark_id_idx ON device_measurements (benchmark_id);
+
CREATE TABLE IF NOT EXISTS model_measurements (
measurement_id SERIAL PRIMARY KEY,
benchmark_id int REFERENCES benchmarks (benchmark_id),
@@ -24,3 +30,4 @@ CREATE TABLE IF NOT EXISTS model_measurements (
time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);
+CREATE INDEX IF NOT EXISTS model_measurements_benchmark_id_idx ON model_measurements (benchmark_id);
diff --git a/benchmark/llama.py b/benchmark/llama.py
index a926f903486607..4a2c57422e6ffb 100644
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@@ -96,17 +96,21 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
)
conn.commit()
benchmark_id = cur.fetchone()[0]
+ logger.info(f"running benchmark #{benchmark_id} on {gpu_name}")
metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection])
metrics_thread.start()
+ logger.info("started background thread to fetch device metrics")
os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling
device = "cuda"
ckpt = "meta-llama/Llama-2-7b-hf"
+ logger.info("downloading weights")
# This is to avoid counting download in model load time measurement
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
+ logger.info("loading model")
start = perf_counter()
model = AutoModelForCausalLM.from_pretrained(
ckpt, torch_dtype=torch.float16, generation_config=gen_config
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 93f9b6f6a170fd..b597f5a73fb5be 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
-ARG PYTORCH='2.4.0'
+ARG PYTORCH='2.5.1'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.
@@ -26,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
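+# tensorflow_text and tensorflow_probability are capped so they remain compatible with the pinned tensorflow==2.13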
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip uninstall -y flax jax
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index 62578ad0f3610f..f22d77b9372d7e 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# If set to nothing, will install the latest version
-ARG PYTORCH='2.4.0'
+ARG PYTORCH='2.5.1'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile
index d765767780f46c..378491a6c60007 100644
--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@@ -18,7 +18,7 @@ RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSIO
RUN python3 -m pip uninstall -y torch flax
RUN python3 -m pip install -U "itsdangerous<2.1.0"
-RUN python3 -m pip install --no-cache-dir -U tensorflow_probability
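+# Cap tensorflow_probability at a release compatible with the TensorFlow version installed above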
+RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22"
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
diff --git a/docs/README.md b/docs/README.md
index 7dbcefc0483c66..bb54d7004130f2 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -276,14 +276,14 @@ building the return.
Here's an example of a single value return:
-```
+```python
Returns:
`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```
Here's an example of a tuple return, comprising several objects:
-```
+```python
Returns:
`tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
- ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
@@ -322,10 +322,9 @@ includes an example of how to transcribe speech to text in the
The syntax for Example docstrings can look as follows:
-```
+```python
Example:
- ```python
>>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import torch
@@ -347,7 +346,6 @@ The syntax for Example docstrings can look as follows:
>>> transcription = processor.batch_decode(predicted_ids)
>>> transcription[0]
'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'
- ```
```
The docstring should give a minimal, clear example of how the respective model
diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md
index 49747821f476f0..64dced450987dc 100644
--- a/docs/TRANSLATING.md
+++ b/docs/TRANSLATING.md
@@ -1,57 +1,70 @@
-### Translating the Transformers documentation into your language
+# Translating the Transformers documentation into your language
-As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
+As part of our mission to democratize machine learning, we aim to make the Transformers library available in many more languages! Follow the steps below to help translate the documentation into your language.
-**🗞️ Open an issue**
+## Open an issue
-To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button.
+1. Navigate to the Issues page of this repository.
+2. Check if anyone has already opened an issue for your language.
+3. If not, create a new issue by selecting the "Translation template" from the "New issue" button.
+4. Post a comment indicating which chapters you’d like to work on, and we’ll add your name to the list.
-Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
+## Fork the repository
+1. First, fork the Transformers repo by clicking the Fork button in the top-right corner.
+2. Clone your fork to your local machine for editing with the following command:
-**🍴 Fork the repository**
+ ```bash
+ git clone https://github.com/YOUR-USERNAME/transformers.git
+ ```
+
+ Replace `YOUR-USERNAME` with your GitHub username.
-First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
+## Copy-paste the English version with a new language code
-Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
+The documentation files are organized in the following directory:
-```bash
-git clone https://github.com/YOUR-USERNAME/transformers.git
-```
+- **docs/source**: This contains all documentation materials organized by language.
-**📋 Copy-paste the English version with a new language code**
+To copy the English version to your new language directory:
-The documentation files are in one leading directory:
+1. Navigate to your fork of the repository:
-- [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language.
+ ```bash
+ cd ~/path/to/transformers/docs
+ ```
-You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
+ Replace `~/path/to` with your actual path.
-```bash
-cd ~/path/to/transformers/docs
-cp -r source/en source/LANG-ID
-```
+2. Run the following command:
-Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
+ ```bash
+ cp -r source/en source/LANG-ID
+ ```
-**✍️ Start translating**
+ Replace `LANG-ID` with the appropriate ISO 639-1 or ISO 639-2 language code (see [this table](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for reference).
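+
+   For example, the Arabic documentation in this repository lives under `source/ar`, so for Arabic the command is `cp -r source/en source/ar`.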
-The fun part comes - translating the text!
+## Start translating
-The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website.
+Begin translating the text!
-> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory!
+1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website.
-The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml):
+ - If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections.
+ - Ensure it is placed in the `docs/source/LANG-ID/` directory.
-```yaml
-- sections:
- - local: pipeline_tutorial # Do not change this! Use the same name for your .md file
- title: Pipelines for inference # Translate this!
- ...
- title: Tutorials # Translate this!
-```
+ Here’s an example structure for the `_toctree.yml` file:
-Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
+ ```yaml
+ - sections:
+ - local: pipeline_tutorial # Keep this name for your .md file
+ title: Pipelines for Inference # Translate this
+ ...
+ title: Tutorials # Translate this
+ ```
-> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu.
+2. Once you’ve translated the `_toctree.yml`, move on to translating the associated MDX files.
+
+## Collaborate and share
+
+If you'd like assistance with your translation, open an issue and tag `@stevhliu`. Feel free to share resources or glossaries to ensure consistent terminology.
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
index 6f7899b53b854e..d9523eaf5da535 100644
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@@ -108,38 +108,38 @@
# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
# title: الإرشاد
# title: أدلة المهام
-# - sections:
-# - local: fast_tokenizers
-# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers
-# - local: multilingual
-# title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات
-# - local: create_a_model
-# title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج
-# - local: custom_models
-# title: مشاركة نموذج مخصص
-# - local: chat_templating
-# title: قوالب لنماذج الدردشة
-# - local: trainer
-# title: المدرب
-# - local: sagemaker
-# title: تشغيل التدريب على Amazon SageMaker
-# - local: serialization
-# title: التصدير إلى ONNX
-# - local: tflite
-# title: التصدير إلى TFLite
-# - local: torchscript
-# title: التصدير إلى TorchScript
+- sections:
+ - local: fast_tokenizers
+ title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers
+ - local: multilingual
+ title: الاستدلال باستخدام نماذج متعددة اللغات
+ - local: create_a_model
+ title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج
+ - local: custom_models
+ title: مشاركة نموذج مخصص
+ - local: chat_templating
+ title: قوالب لنماذج الدردشة
+ - local: trainer
+ title: المدرب
+ - local: sagemaker
+ title: تشغيل التدريب على Amazon SageMaker
+ - local: serialization
+ title: التصدير إلى ONNX
+ - local: tflite
+ title: التصدير إلى TFLite
+ - local: torchscript
+ title: التصدير إلى TorchScript
# - local: benchmarks
# title: المعايير
# - local: notebooks
# title: دفاتر الملاحظات مع الأمثلة
# - local: community
# title: موارد المجتمع
-# - local: troubleshooting
-# title: استكشاف الأخطاء وإصلاحها
-# - local: gguf
-# title: التوافق مع ملفات GGUF
-# title: أدلة المطورين
+ - local: troubleshooting
+ title: استكشاف الأخطاء وإصلاحها
+ - local: gguf
+ title: التوافق مع ملفات GGUF
+ title: أدلة المطورين
# - sections:
# - local: quantization/overview
# title: نظرة عامة
diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md
index 92b2a4715f6f07..1213b35008605b 100644
--- a/docs/source/ar/agents.md
+++ b/docs/source/ar/agents.md
@@ -464,7 +464,7 @@ image = image_generator(prompt=improved_prompt)
قبل إنشاء الصورة أخيرًا:
-
+
> [!WARNING]
> تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا.
diff --git a/docs/source/ar/chat_templating.md b/docs/source/ar/chat_templating.md
new file mode 100644
index 00000000000000..90f4ac820e146a
--- /dev/null
+++ b/docs/source/ar/chat_templating.md
@@ -0,0 +1,835 @@
+# قوالب نماذج الدردشة
+
+## مقدمة
+
+تعد **الدردشة** أحد استخدامات نماذج اللغات الكبيرة (LLMs) شائعة الاستخدام بشكل متزايد. ففي سياق الدردشة، وبدلاً من متابعة سلسلة نصية واحدة (كما هو الحال مع نماذج اللغات القياسية)، يواصل النموذج بدلاً من ذلك محادثة تتكون من رسالة واحدة أو أكثر، تتضمن كل منها دورًا، مثل "المستخدم" أو "المساعد"، بالإضافة إلى نص الرسالة.
+
+وكما هو الحال مع تقسيم النص إلى رموز (tokenization)، تتوقع النماذج المختلفة تنسيقات إدخال مختلفة تمامًا للمحادثة. لهذا السبب أضفنا **قوالب الدردشة** كميزة جديدة. تُعد قوالب المحادثة جزءًا من tokenizer. تحدد هذه القوالب كيفية تحويل المحادثات، والتي يتم تمثيلها كقوائم من الرسائل، إلى سلسلة نصية واحدة قابلة للتقسيم إلى رموز بالتنسيق الذي يتوقعه النموذج.
+
+دعونا نجعل هذا ملموسًا بمثال سريع باستخدام نموذج `BlenderBot`. لدى BlenderBot قالب افتراضي بسيط للغاية، والذي يضيف في الغالب مسافات بيضاء بين جولات الحوار:
+
+```python
+>>> from transformers import AutoTokenizer
+>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
+
+>>> chat = [
+... {"role": "user", "content": "Hello, how are you?"},
+... {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+... {"role": "user", "content": "I'd like to show off how chat templating works!"},
+... ]
+
+>>> tokenizer.apply_chat_template(chat, tokenize=False)
+" Hello, how are you? I'm doing great. How can I help you today? I'd like to show off how chat templating works!"
+```
+
+لاحظ كيف تم ضغط الدردشة بأكملها في سلسلة واحدة. إذا استخدمنا `tokenize=True`، وهو الإعداد الافتراضي، فسيتم أيضًا تقسيم تلك السلسلة إلى رموز نيابةً عنا. ولكن لنشاهد قالبًا أكثر تعقيدًا في العمل، دعونا نستخدم نموذج `mistralai/Mistral-7B-Instruct-v0.1`.
+
+```python
+>>> from transformers import AutoTokenizer
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
+
+>>> chat = [
+... {"role": "user", "content": "Hello, how are you?"},
+... {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+... {"role": "user", "content": "I'd like to show off how chat templating works!"},
+... ]
+
+>>> tokenizer.apply_chat_template(chat, tokenize=False)
+"[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]"
+```
+
+لاحظ كيف أضاف المجزئ اللغوي (tokenizer) رموز التحكم `[INST]` و `[/INST]` للإشارة إلى بداية ونهاية رسائل المستخدم (ولكن ليس رسائل المساعد!)، وتم ضغط المحادثة بأكملها في سلسلة نصية واحدة. إذا استخدمنا `tokenize=True`، وهو الإعداد الافتراضي، فسيتم أيضًا تقسيم تلك السلسلة إلى رموز.
+
+حاول الآن استخدام نفس الشفرة، لكن مع استبدال النموذج بـ `HuggingFaceH4/zephyr-7b-beta` ، وستحصل على:
+```text
+<|user|>
+Hello, how are you?
+<|assistant|>
+I'm doing great. How can I help you today?
+<|user|>
+I'd like to show off how chat templating works!
+```
+تم إجراء ضبط دقيق لكل من Zephyr و Mistral-Instruct انطلاقًا من نفس النموذج الأصلي، Mistral-7B-v0.1. ومع ذلك، فقد تم تدريبهما بتنسيقات دردشة مختلفة تمامًا. بدون قوالب الدردشة، ستضطر إلى كتابة شفرة تنسيق يدويًا لكل نموذج، ومن السهل جدًا ارتكاب أخطاء بسيطة تؤثر على الأداء! تُدير قوالب الدردشة تفاصيل التنسيق نيابةً عنك، مما يتيح لك كتابة شفرة عامة تعمل مع أي نموذج.
+
+## كيف أستخدم قوالب الدردشة؟
+
+كما رأيت في المثال السابق، من السهل استخدام قوالب الدردشة. قم ببساطة بإنشاء قائمة من الرسائل، مع مفتاحي `role` و`content`، ثم قم بتمريرها إلى [`~PreTrainedTokenizer.apply_chat_template`] . بمجرد قيامك بذلك، ستحصل على مخرجات جاهزة للاستخدام! عند استخدام قوالب الدردشة كإدخال لتوليد نصوص بواسطة النموذج، فمن الجيد أيضًا استخدام `add_generation_prompt=True` لإضافة [مطالبات توليد النصوص](#what-are-generation-prompts).
+
+فيما يلي مثال على إعداد الإدخال لـ `model.generate()`، باستخدام Zephyr مرة أخرى:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "HuggingFaceH4/zephyr-7b-beta"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint) # قد ترغب في استخدام bfloat16 و/أو الانتقال إلى GPU هنا
+
+messages = [
+ {
+ "role": "system",
+ "content": "You are a friendly chatbot who always responds in the style of a pirate",
+ },
+ {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+print(tokenizer.decode(tokenized_chat[0]))
+```
+سيؤدي هذا إلى إنتاج سلسلة نصية بتنسيق الإدخال الذي يتوقعه Zephyr.
+
+```text
+<|system|>
+You are a friendly chatbot who always responds in the style of a pirate
+<|user|>
+How many helicopters can a human eat in one sitting?
+<|assistant|>
+```
+
+الآن بعد أن تم تنسيق الإدخال بشكل صحيح لـ Zephyr، يمكننا استخدام النموذج لإنشاء رد على سؤال المستخدم:
+
+```python
+outputs = model.generate(tokenized_chat, max_new_tokens=128)
+print(tokenizer.decode(outputs[0]))
+```
+
+سيؤدي هذا إلى ما يلي:
+
+```text
+<|system|>
+You are a friendly chatbot who always responds in the style of a pirate
+<|user|>
+How many helicopters can a human eat in one sitting?
+<|assistant|>
+Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
+```
+
+كان ذلك سهلاً بعد كل شيء !
+
+
+
+## هل هناك قنوات معالجة أوتوماتيكية للدردشة؟
+
+نعم، توجد! تدعم قنوات معالجة توليد النصوص مُدخلات الدردشة، مما يُسهّل استخدام نماذج الدردشة. في الماضي، كنا نستخدم فئة `ConversationalPipeline` المخصصة، ولكن تم إيقافها الآن ودُمجت وظائفها في [`TextGenerationPipeline`]. دعونا نجرب مثال Zephyr مرة أخرى، ولكن هذه المرة باستخدام قناة معالجة:
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta")
+messages = [
+ {
+ "role": "system",
+ "content": "You are a friendly chatbot who always responds in the style of a pirate",
+ },
+ {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+]
+print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # طباعة استجابة المساعد
+```
+
+```text
+{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all."}
+```
+
+ستُراعي قناة المعالجة جميع تفاصيل التقسيم إلى رموز واستدعاء `apply_chat_template` نيابةً عنك - بمجرد أن يصبح لدى النموذج قالب دردشة، كل ما تحتاج إلى فعله هو تهيئة قناة معالجة وتمرير قائمة الرسائل إليها!
+
+## ما هي "مطالبات التوليد"؟
+
+قد تلاحظ أن طريقة `apply_chat_template` لها معامل `add_generation_prompt`. يُخبر هذا المعامل القالب بإضافة رموز تشير إلى بداية رد البوت. على سبيل المثال، ضع في اعتبارك الدردشة التالية:
+
+```python
+messages = [
+ {"role": "user", "content": "Hi there!"},
+ {"role": "assistant", "content": "Nice to meet you!"},
+ {"role": "user", "content": "Can I ask a question?"}
+]
+```
+
+إليك كيف سيبدو ذلك بدون موجه توليد نصوص ، بالنسبة لنموذج يستخدم تنسيق "ChatML" القياسي :
+
+```python
+tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+"""<|im_start|>user
+Hi there!<|im_end|>
+<|im_start|>assistant
+Nice to meet you!<|im_end|>
+<|im_start|>user
+Can I ask a question?<|im_end|>
+"""
+```
+
+وهكذا يبدو الأمر **مع** مطالبة التوليد:
+
+```python
+tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+"""<|im_start|>user
+Hi there!<|im_end|>
+<|im_start|>assistant
+Nice to meet you!<|im_end|>
+<|im_start|>user
+Can I ask a question?<|im_end|>
+<|im_start|>assistant
+"""
+```
+
+لاحظ أننا أضفنا هذه المرة الرموز التي تشير إلى بداية رد البوت. يضمن هذا أنه عندما يُولّد النموذج نصًا فسيكتب رد البوت بدلاً من القيام بشيء غير متوقع، مثل الاستمرار في رسالة المستخدم. تذكر، أن نماذج الدردشة لا تزال مجرد نماذج للغة - فهي مدربة على متابعة النصوص، والدردشة هي مجرد نوع خاص من النصوص بالنسبة لها! يجب توجيهها برموز تحكم مناسبة، حتى تعرف ما الذي يجب عليها فعله.
+
+لا تتطلب جميع النماذج رموز تحكم لتوليد النصوص. بعض النماذج، مثل LLaMA، ليس لديها أي رموز خاصة قبل ردود البوت. في هذه الحالات، لن يكون لمعامل `add_generation_prompt` أي تأثير. يعتمد التأثير الدقيق لـ `add_generation_prompt` على القالب المستخدم.
+
+## ما وظيفة "continue_final_message"؟
+
+عند تمرير قائمة من الرسائل إلى `apply_chat_template` أو `TextGenerationPipeline` ، يمكنك اختيار تنسيق المحادثة بحيث يواصل النموذج الرسالة الأخيرة في المحادثة بدلاً من بدء رسالة جديدة. يتم ذلك عن طريق إزالة أي رموز نهاية التسلسل التي تشير إلى نهاية الرسالة الأخيرة ، بحيث يقوم النموذج ببساطة بتمديد الرسالة الأخيرة عندما يبدأ في توليد النص . يُعد هذا أمرًا مفيدًا "لِمَلء بداية" رد النموذج مُسبقًا.
+
+وهنا مثال:
+```python
+chat = [
+ {"role": "user", "content": "Can you format the answer in JSON?"},
+ {"role": "assistant", "content": '{"name": "'},
+]
+
+formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True)
+model.generate(**formatted_chat)
+```
+سيقوم النموذج بتوليد نص يكمل سلسلة JSON بدلاً من بدء رسالة جديدة. يمكن أن يكون هذا النهج مفيدًا جدًا لتحسين دقة التزام النموذج بالتعليمات عندما تعرف كيف تريد أن يبدأ رده.
+
+نظرًا لأن `add_generation_prompt` تضيف الرموز التي تبدأ رسالة جديدة ، و `continue_final_message` تزيل أي رموز نهاية الرسالة من الرسالة الأخيرة ، فليس من المنطقي استخدامهما معًا . ونتيجة لذلك ، ستتلقّى خطأً إذا حاولت ذلك !
+
+السلوك الافتراضي لِـ `TextGenerationPipeline` هو تعيين `add_generation_prompt=True` بحيث تبدأ رسالة جديدة . ومع ذلك ، إذا كانت الرسالة الأخيرة في المحادثة التي تم إدخالها لديها دور "assistant" ، فسوف تفترض أن هذه الرسالة هي "مَلء بداية" وتتحوّل إلى `continue_final_message=True` بدلاً من ذلك ، لأن مُعظم النماذج لا تدعم عدة رسائل متتالية للمساعد . يمكنك تجاوز هذا السلوك عن طريق تمرير معامل `continue_final_message` بشكل صريح عند استدعاء قناة المعالجة .
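+
+على سبيل المثال، هذا مخطط توضيحي مبسّط (بافتراض استخدام `pipe` من مثال Zephyr السابق) يوضح تمرير `continue_final_message` صراحةً عند استدعاء قناة المعالجة:
+
+```python
+chat = [
+    {"role": "user", "content": "Can you format the answer in JSON?"},
+    {"role": "assistant", "content": '{"name": "'},  # "ملء بداية" لرد المساعد
+]
+
+# نمرر continue_final_message صراحةً لتجاوز السلوك الافتراضي لقناة المعالجة
+out = pipe(chat, max_new_tokens=32, continue_final_message=True)
+print(out[0]["generated_text"][-1])  # طباعة الرسالة الأخيرة بعد إكمالها
+```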
+
+
+
+## هل يمكنني استخدام قوالب الدردشة في التدريب؟
+
+نعم ! تُعد هذه طريقة جيدة للتأكد من أن قالب الدردشة يتطابق مع الرموز التي يراها النموذج أثناء التدريب . نوصي بتطبيق قالب الدردشة كخطوة معالجة أولية لمجموعة بياناتك . بعد ذلك ، يمكنك ببساطة متابعة عملية التدريب كما هو الحال مع أي مهمة تدريب نماذج لغات أخرى . عند التدريب ، يجب أن تُعيّن عادةً `add_generation_prompt=False` ، لأنه لن تكون الرموز المُضافة لتحفيز رد المساعد مفيدة أثناء التدريب . دعونا نرى مثالاً :
+
+```python
+from transformers import AutoTokenizer
+from datasets import Dataset
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+
+chat1 = [
+ {"role": "user", "content": "Which is bigger, the moon or the sun?"},
+ {"role": "assistant", "content": "The sun."}
+]
+chat2 = [
+ {"role": "user", "content": "Which is bigger, a virus or a bacterium?"},
+ {"role": "assistant", "content": "A bacterium."}
+]
+
+dataset = Dataset.from_dict({"chat": [chat1, chat2]})
+dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
+print(dataset['formatted_chat'][0])
+```
+ونحصل على:
+
+```text
+<|user|>
+Which is bigger, the moon or the sun?
+<|assistant|>
+The sun.
+```
+
+من هنا، استمر في التدريب كما تفعل مع مهمة نمذجة اللغة القياسية، باستخدام عمود `formatted_chat`.
+
+
+بشكل افتراضي، تضيف بعض المجزئات اللغوية (tokenizers) رموزًا خاصة مثل `<bos>` و `<eos>` إلى النص الذي تقوم بتقسيمه إلى رموز. يجب أن تتضمن قوالب الدردشة بالفعل جميع الرموز الخاصة التي تحتاجها، وبالتالي فإن الرموز الخاصة الإضافية ستكون غالبًا غير صحيحة أو مُكررة، مما سيؤثر سلبًا على أداء النموذج.
+
+لذلك ، إذا قمت بتنسيق النص باستخدام `apply_chat_template(tokenize=False)` ، فيجب تعيين المعامل `add_special_tokens=False` عندما تقوم بتقسيم ذلك النص إلى رموز لاحقًا . إذا كنت تستخدم `apply_chat_template(tokenize=True)` ، فلن تحتاج إلى القلق بشأن ذلك !
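+
+فيما يلي مخطط توضيحي قصير (بافتراض وجود `tokenizer` وقائمة `chat` كما في الأمثلة السابقة) يوضح هذه النقطة:
+
+```python
+# نسّق المحادثة كنص أولًا دون تقسيمها إلى رموز
+text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+
+# عند التقسيم إلى رموز لاحقًا، عطّل إضافة الرموز الخاصة لأن القالب أضافها بالفعل
+model_inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")
+```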
+
+
+## متقدّم: مدخلات إضافية لِقوالب الدردشة
+
+
+المعامل الوحيد الذي تتطلبه طريقة `apply_chat_template` هو `messages`. ومع ذلك، يمكنك تمرير أي معامل ككلمة مفتاحية إلى `apply_chat_template` وسيكون متاحًا داخل القالب. يمنحك هذا الكثير من المرونة لاستخدام قوالب الدردشة للعديد من الأشياء. لا توجد قيود على أسماء هذه المعاملات أو تنسيقاتها - يمكنك تمرير سلاسل نصية أو قوائم أو قواميس أو أي شيء آخر تريده.
+
+ومع ذلك، هناك بعض الحالات الشائعة لاستخدام هذه المعاملات الإضافية، مثل تمرير أدوات لاستدعاء الدوال، أو مستندات للتوليد المعزز بالاسترجاع. في هذه الحالات الشائعة، لدينا بعض التوصيات المحددة حول أسماء هذه المعاملات وتنسيقاتها، والتي يتم وصفها في الأقسام التالية. نشجع مطوري النماذج على جعل قوالب الدردشة الخاصة بهم متوافقة مع هذا التنسيق، لتسهيل نقل كود استدعاء الأدوات بين النماذج.
+
+## متقدم: استخدام الأداة / استدعاء الدالة
+
+يمكن لنماذج "استخدام الأداة" اختيار استدعاء الدوال كأدوات خارجية قبل توليد الإجابة. عند تمرير الأدوات إلى نموذج استخدام الأدوات، يمكنك ببساطة تمرير قائمة من الوظائف إلى معامل `tools`:
+
+```python
+from datetime import datetime
+
+def current_time():
+ """Get the current local time as a string."""
+ return str(datetime.now())
+
+def multiply(a: float, b: float):
+ """
+ A function that multiplies two numbers
+
+ Args:
+ a: The first number to multiply
+ b: The second number to multiply
+ """
+ return a * b
+
+tools = [current_time, multiply]
+
+model_input = tokenizer.apply_chat_template(
+ messages,
+ tools=tools
+)
+```
+
+لكي يعمل هذا بشكل صحيح، يجب عليك كتابة وظائفك بالتنسيق السابق، حتى يمكن تحليلها بشكل صحيح كأدوات. على وجه التحديد، يجب عليك اتباع هذه القواعد:
+
+- يجب أن يكون للدالة اسم وصفي.
+- يجب أن يكون لكل معامل تلميح نوع (type hint).
+- يجب أن تحتوي الدالة على سلسلة توثيق (docstring) بتنسيق Google القياسي (أي وصف أولي للدالة متبوعًا بكتلة `Args:` تصف المعاملات، ما لم تكن الدالة بلا معاملات).
+- لا تقم بتضمين الأنواع في كتلة `Args:`. بعبارة أخرى، اكتب `a: The first number to multiply`، وليس `a (int): The first number to multiply`. يجب أن توضع تلميحات الأنواع في رأس الدالة بدلاً من ذلك.
+- يمكن أن يكون للدالة نوع إرجاع وكتلة `Returns:` في سلسلة التوثيق. ومع ذلك، فهذه اختيارية لأن معظم نماذج استخدام الأدوات تتجاهلها.
+
+### تمرير نتائج الأداة إلى النموذج
+
+تكفي الشفرة السابقة لسرد الأدوات المتاحة لنموذجك، ولكن ماذا يحدث إذا أراد النموذج استخدام واحدة منها؟ إذا حدث ذلك، فيجب عليك:
+
+1. تحليل مخرجات النموذج للحصول على اسم (أسماء) الأدوات ومعاملاتها.
+2. إضافة استدعاء (استدعاءات) النموذج للأدوات إلى المحادثة.
+3. استدعاء الدالة (الدوال) المقابلة بتلك المعاملات.
+4. إضافة النتيجة (النتائج) إلى المحادثة.
+
+### مثال كامل على استخدام الأداة
+
+
+سنستعرض مثالاً على استخدام الأدوات خطوة بخطوة . في هذا المثال ، سنستخدم نموذج `Hermes-2-Pro` بحجم 8 مليارات معامل ، نظرًا لأنه أحد أعلى نماذج استخدام الأدوات أداءً في فئة حجمه وقت كتابة هذا النص . إذا كان لديك الذاكرة الكافية ، فيمكنك النظر في استخدام نموذج أكبر بدلاً من ذلك مثل `Command-R` أو `Mixtral-8x22B` ، وكلاهما يدعم استخدام الأدوات ويوفر أداءً أقوى .
+
+
+أولاً ، لنقم بتحميل نموذجنا و tokenizer الخاص بنا:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+الآن، لنقم بإعداد محادثة لروبوتنا:
+
+```python
+messages = [
+ {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
+ {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
+]
+```
+
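+يحتاج بقية المثال أيضًا إلى قائمة `tools`. فيما يلي تعريف توضيحي مبسّط لأداة درجة الحرارة المستخدمة لاحقًا في المثال (دالة وهمية تعيد دائمًا 22.0 كما هو موضح لاحقًا):
+
+```python
+def get_current_temperature(location: str, unit: str) -> float:
+    """
+    Get the current temperature at a location.
+
+    Args:
+        location: The location to get the temperature for, in the format "City, Country"
+        unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
+    """
+    return 22.0  # دالة وهمية لأغراض التوضيح؛ دالة حقيقية ستجلب درجة الحرارة فعلًا!
+
+tools = [get_current_temperature]
+```
+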
+الآن، لنُطبّق قالب الدردشة ونولّد ردًا:
+
+```python
+inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+ونحصل على:
+
+```text
+
+{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
+<|im_end|>
+```
+
+لقد قام النموذج باستدعاء الدالة بمعاملات صحيحة، وبالصيغة التي طلبها توثيق الدالة. لقد استنتج أننا نشير على الأرجح إلى باريس في فرنسا، وبما أنها موطن النظام الدولي للوحدات، استنتج أن درجة الحرارة في فرنسا يجب أن تُعرض بالدرجة المئوية.
+
+دعنا نضيف استدعاء الأداة الخاص بالنموذج إلى المحادثة. لاحظ أننا نولد معرف استدعاء أداة عشوائيًا هنا. لا تستخدم جميع النماذج هذه المعرفات، ولكنها تسمح للنماذج بإصدار عدة استدعاءات للأدوات في نفس الوقت وتتبع الاستجابة المقابلة لكل استدعاء. يمكنك توليد هذه المعرفات بأي طريقة تريدها، ولكن يجب أن تكون فريدة داخل كل محادثة.
+
+```python
+tool_call_id = "vAHdf3" # Random ID, should be unique for each tool call
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"id": tool_call_id, "type": "function", "function": tool_call}]})
+```
+
+الآن بعد أن أضفنا استدعاء الأداة إلى المحادثة، يمكننا استدعاء الدالة وإضافة النتيجة إلى المحادثة. نظرًا لأننا نستخدم دالة وهمية لهذا المثال والتي تعيد دائمًا 22.0، فيمكننا ببساطة إضافة تلك النتيجة مباشرةً. لاحظ معرف استدعاء الأداة - يجب أن يتطابق مع المعرف المستخدم في استدعاء الأداة أعلاه.
+
+```python
+messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"})
+```
+
+أخيرًا، دعنا نجعل المساعد يقرأ مخرجات الدالة ويكمل الدردشة مع المستخدم:
+
+```python
+inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+ونحصل على:
+
+```text
+The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|>
+```
+
+
+لا تستخدم جميع نماذج استخدام الأدوات جميع ميزات استدعاء الأدوات الموضحة أعلاه. يستخدم بعضها معرفات استدعاء الأدوات، بينما يكتفي بعضها الآخر باسم الدالة ويطابق استدعاءات الأدوات بالنتائج حسب الترتيب، وهناك عدة نماذج لا تستخدم أيًا منهما ولا تصدر سوى استدعاء أداة واحد في كل مرة لتجنب الالتباس. إذا كنت تريد أن يكون كودك متوافقًا مع أكبر عدد ممكن من النماذج، فإننا نوصي بهيكلة استدعاءات الأدوات كما هو موضح هنا، وإعادة نتائج الأدوات بالترتيب الذي أصدرها به النموذج. يجب أن يتولى قالب الدردشة الخاص بكل نموذج التعامل مع الباقي.
+
+
+### فهم مخططات الأدوات
+
+يتم تحويل كل دالة تقوم بتمريرها إلى معامل `tools` في دالة `apply_chat_template` إلى [مخطط JSON](https://json-schema.org/learn/getting-started-step-by-step). يتم بعد ذلك تمرير هذه المخططات إلى قالب الدردشة الخاص بالنموذج. وبعبارة أخرى، فإن نماذج استخدام الأدوات لا ترى دوالك مباشرة، ولا ترى مطلقًا الكود الموجود بداخلها. ما يهمها هو **تعريفات** الدوال و**المعاملات** التي تحتاج إلى تمريرها إليها - فهي تهتم بما تفعله الأدوات وكيفية استخدامها، وليس بكيفية عملها! يقع على عاتقك قراءة مخرجاتها، والكشف عما إذا كانت قد طلبت استخدام أداة، وتمرير المعاملات إلى دالة الأداة، وإرجاع الرد في الدردشة.
+
+يجب أن يكون إنشاء مخططات JSON لتمريرها إلى القالب تلقائيًا وغير مرئي طالما أن دوالك تتبع المواصفات الموضحة أعلاه، ولكن إذا واجهت مشكلات، أو إذا كنت تريد ببساطة مزيدًا من التحكم في التحويل، فيمكنك التعامل مع التحويل يدويًا. فيما يلي مثال على تحويل مخطط يدوي:
+
+```python
+from transformers.utils import get_json_schema
+
+def multiply(a: float, b: float):
+ """
+ A function that multiplies two numbers
+
+ Args:
+ a: The first number to multiply
+ b: The second number to multiply
+ """
+ return a * b
+
+schema = get_json_schema(multiply)
+print(schema)
+```
+
+سيؤدي هذا إلى ما يلي:
+
+```json
+{
+ "type": "function",
+ "function": {
+ "name": "multiply",
+ "description": "A function that multiplies two numbers",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "a": {
+ "type": "number",
+ "description": "The first number to multiply"
+ },
+ "b": {
+ "type": "number",
+ "description": "The second number to multiply"
+ }
+ },
+ "required": ["a", "b"]
+ }
+ }
+}
+```
+
+إذا كنت ترغب في ذلك، يمكنك تحرير هذه المخططات، أو حتى كتابتها من البداية بنفسك دون استخدام `get_json_schema` على الإطلاق. يمكن تمرير مخططات JSON مباشرةً إلى معامل `tools` في `apply_chat_template` - يمنحك هذا الكثير من القوة لتعريف مخططات دقيقة لوظائف أكثر تعقيدًا. ولكن كن حذرًا - كلما زاد تعقيد مخططاتك، زاد احتمال ارتباك النموذج عند التعامل معها! نوصي بتوقيعات دوال بسيطة حيثما أمكن، مع تقليل المعاملات (وخاصة المعاملات المعقدة والمتداخلة) إلى الحد الأدنى.
+
+فيما يلي مثال على تعريف المخططات يدويًا، وتمريرها مباشرةً إلى `apply_chat_template`:
+
+```python
+# A simple function that takes no arguments
+current_time = {
+ "type": "function",
+ "function": {
+ "name": "current_time",
+ "description": "Get the current local time as a string.",
+ "parameters": {
+ 'type': 'object',
+ 'properties': {}
+ }
+ }
+}
+
+# A more complete function that takes two numerical arguments
+multiply = {
+ 'type': 'function',
+ 'function': {
+ 'name': 'multiply',
+ 'description': 'A function that multiplies two numbers',
+ 'parameters': {
+ 'type': 'object',
+ 'properties': {
+ 'a': {
+ 'type': 'number',
+ 'description': 'The first number to multiply'
+ },
+ 'b': {
+ 'type': 'number', 'description': 'The second number to multiply'
+ }
+ },
+ 'required': ['a', 'b']
+ }
+ }
+}
+
+model_input = tokenizer.apply_chat_template(
+ messages,
+ tools = [current_time, multiply]
+)
+```
+
+## متقدم: توليد قائم على الاسترجاع
+يمكن لنماذج اللغة الكبيرة من نوع "توليد قائم على الاسترجاع" أو "RAG" البحث في مجموعة نصوص عن معلومات قبل الرد على الاستعلام. يسمح هذا للنماذج بتوسيع قاعدة معارفها بشكل كبير إلى ما هو أبعد من حجم سياقها المحدود. توصيتنا لنماذج RAG هي أن يقبل قالبها وسيطة `documents`. يجب أن تكون هذه قائمة من المستندات، حيث يكون كل "مستند" عبارة عن قاموس واحد بمفاتيح `title` و `contents`، وكلاهما سلاسل نصية. نظرًا لأن هذا التنسيق أبسط بكثير من مخططات JSON المستخدمة للأدوات، فلا توجد حاجة إلى دوال مساعدة.
+
+فيما يلي مثال على قالب RAG بالفعل:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# تحميل النموذج والمجزىء اللغوي
+model_id = "CohereForAI/c4ai-command-r-v01-4bit"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+device = model.device # الحصول على الجهاز الذي تم تحميل النموذج عليه
+
+# تعريف مُدخلات المحادثة
+conversation = [
+ {"role": "user", "content": "What has Man always dreamed of?"}
+]
+
+# تعريف المستندات لتوليد قائم على الاسترجاع
+documents = [
+ {
+ "title": "The Moon: Our Age-Old Foe",
+ "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
+ },
+ {
+ "title": "The Sun: Our Age-Old Friend",
+ "text": "Although often underappreciated, the sun provides several notable benefits..."
+ }
+]
+# معالجة المحادثة والمستندات باستخدام قالب RAG، وإرجاع موترات PyTorch.
+input_ids = tokenizer.apply_chat_template(
+ conversation=conversation,
+ documents=documents,
+ chat_template="rag",
+ tokenize=True,
+ add_generation_prompt=True,
+ return_tensors="pt").to(device)
+
+# توليد الرد
+gen_tokens = model.generate(
+ input_ids,
+ max_new_tokens=100,
+ do_sample=True,
+ temperature=0.3,
+ )
+
+# فك تشفير النص المُوَلّد وطباعته
+gen_text = tokenizer.decode(gen_tokens[0])
+print(gen_text)
+```
+إن مُدخل `documents` للتوليد القائم على الاسترجاع غير مدعوم على نطاق واسع، والعديد من النماذج لديها قوالب دردشة تتجاهل هذا المُدخل ببساطة.
+
+للتحقق مما إذا كان النموذج يدعم مُدخل `documents`، يمكنك قراءة بطاقة النموذج الخاصة به، أو `print(tokenizer.chat_template)` لمعرفة ما إذا كان مفتاح `documents` مستخدمًا في أي مكان.
+
+ومع ذلك، فمن فئات النماذج التي تدعمه [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) و [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) من Cohere، من خلال قالب الدردشة `rag` الخاص بهما. يمكنك رؤية أمثلة إضافية على التوليد باستخدام هذه الميزة في بطاقتي النموذجين.
+
+
+## متقدم: كيف تعمل قوالب الدردشة؟
+يتم تخزين قالب الدردشة للنموذج في الخاصية `tokenizer.chat_template`. إذا لم يتم تعيين قالب دردشة، فسيتم استخدام القالب الافتراضي لفئة النموذج هذه بدلاً من ذلك. دعونا نلقي نظرة على قالب دردشة `Zephyr`، ولكن لاحظ أن هذا القالب مُبسّط قليلاً عن القالب الفعلي!
+
+```
+{%- for message in messages %}
+    {{- '<|' + message['role'] + '|>\n' }}
+ {{- message['content'] + eos_token }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|assistant|>\n' }}
+{%- endif %}
+```
+إذا لم تكن قد رأيت أحد هذه القوالب من قبل، فهذا [قالب Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/). Jinja هي لغة قوالب تسمح لك بكتابة تعليمات برمجية بسيطة تُولّد نصًا. من نواحٍ عديدة، يشبه تركيبها ورموزها لغة Python. لو كُتب هذا القالب بلغة Python، فسيبدو كما يلي:
+
+```python
+for message in messages:
+ print(f'<|{message["role"]}|>')
+ print(message['content'] + eos_token)
+if add_generation_prompt:
+ print('<|assistant|>')
+```
+يقوم القالب بثلاثة أشياء بشكل أساسي:
+
+- لكل رسالة، يطبع الدور محاطًا بـ `<|` و `|>`، مثل `<|user|>` أو `<|assistant|>`.
+- بعد ذلك، يطبع محتوى الرسالة متبوعًا برمز نهاية التسلسل `eos_token`.
+- أخيرًا، إذا تم تعيين `add_generation_prompt`، يطبع رمز المساعد، حتى يعرف النموذج أنه يجب أن يبدأ في توليد رد المساعد.
+
+هذا قالب بسيط جدًا، لكن Jinja تمنحك الكثير من المرونة للقيام بأشياء أكثر تعقيدًا! دعونا نرى قالب Jinja يُمكنه تنسيق المُدخلات بطريقة تُشبه الطريقة التي تُنسّق بها LLaMA مُدخلاتها (لاحظ أن قالب LLaMA الحقيقي يتضمن معالجة لرسائل النظام الافتراضية ومعالجة رسائل النظام بشكل مختلف قليلاً بشكل عام - لا تستخدم هذا القالب في التعليمات البرمجية الفعلية الخاصة بك!)
+```
+{%- for message in messages %}
+ {%- if message['role'] == 'user' %}
+ {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }}
+ {%- elif message['role'] == 'system' %}
+        {{- '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}
+ {%- elif message['role'] == 'assistant' %}
+ {{- ' ' + message['content'] + ' ' + eos_token }}
+ {%- endif %}
+{%- endfor %}
+```
+نأمل أنه إذا حدقت في هذا لفترة قصيرة، يمكنك أن ترى ما يفعله هذا القالب - فهو يُضيف رموزًا مُحددة مثل `[INST]` و `[/INST]` بناءً على دور كل رسالة. يمكن تمييز رسائل المستخدم والمساعد والنظام بوضوح للنموذج بسبب الرموز التي تُحيط بها.
+
+## متقدم: إضافة وتعديل قوالب الدردشة
+
+### كيف أنشئ قالب دردشة؟
+ببساطة، اكتب قالب Jinja واضبط `tokenizer.chat_template`. قد تجد أنه من الأسهل البدء بقالب موجود من نموذج آخر وتحريره ببساطة ليناسب احتياجاتك! على سبيل المثال، يمكننا أن نأخذ قالب LLaMA أعلاه ونضيف `[ASST]` و `[/ASST]` إلى رسائل المساعد:
+
+```
+{%- for message in messages %}
+ {%- if message['role'] == 'user' %}
+ {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}
+ {%- elif message['role'] == 'system' %}
+        {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}
+ {%- elif message['role'] == 'assistant' %}
+ {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }}
+ {%- endif %}
+{%- endfor %}
+```
+
+الآن، اضبط ببساطة الخاصية `tokenizer.chat_template`. في المرة القادمة التي تستخدم فيها [`~PreTrainedTokenizer.apply_chat_template`] ، سيستخدم القالب الجديد الخاص بك! سيتم حفظ هذه الخاصية في ملف `tokenizer_config.json`، حتى تتمكن من استخدام [`~utils.PushToHubMixin.push_to_hub`] لتحميل قالبك الجديد إلى Hub والتأكد من أن الجميع يستخدم القالب الصحيح لنموذجك!
+
+```python
+template = tokenizer.chat_template
+template = template.replace("SYS", "SYSTEM") # تغيير رمز النظام
+tokenizer.chat_template = template # تعيين القالب الجديد
+tokenizer.push_to_hub("model_name") # تحميل القالب الجديد إلى Hub!
+```
+
+تُستدعى الدالة [`~PreTrainedTokenizer.apply_chat_template`]، التي تستخدم قالب الدردشة الخاص بك، بواسطة فئة [`TextGenerationPipeline`]، لذلك بمجرد تعيين قالب الدردشة الصحيح، سيصبح نموذجك متوافقًا تلقائيًا مع [`TextGenerationPipeline`].
+
+
+إذا كنت تُجري ضبطًا دقيقًا لنموذج للدردشة، بالإضافة إلى تعيين قالب دردشة، فربما يجب عليك إضافة أي رموز تحكم دردشة جديدة كرموز خاصة في المجزىء اللغوي. لا يتم تقسيم الرموز الخاصة أبدًا، مما يضمن معالجة رموز التحكم الخاصة بك دائمًا كرموز فردية بدلاً من تجزئتها إلى أجزاء. يجب عليك أيضًا تعيين خاصية `eos_token` للمجزىء اللغوي إلى الرمز الذي يُشير إلى نهاية توليدات المساعد في قالبك. سيضمن هذا أن أدوات توليد النصوص يمكنها تحديد وقت إيقاف توليد النص بشكل صحيح.
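+
+على سبيل المثال، هذا مخطط توضيحي مبسّط بافتراض قالب بنمط ChatML يستخدم الرمزين `<|im_start|>` و `<|im_end|>`:
+
+```python
+# أضف رموز التحكم الجديدة كرموز خاصة حتى لا يتم تجزئتها إلى أجزاء
+tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})
+
+# عيّن رمز نهاية التسلسل إلى الرمز الذي ينهي رسائل المساعد في قالبك
+tokenizer.eos_token = "<|im_end|>"
+
+# وسّع مصفوفة التضمينات في النموذج لتشمل الرموز المضافة حديثًا
+model.resize_token_embeddings(len(tokenizer))
+```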
+
+
+### لماذا تحتوي بعض النماذج على قوالب متعددة؟
+تستخدم بعض النماذج قوالب مختلفة لحالات استخدام مختلفة. على سبيل المثال، قد تستخدم قالبًا واحدًا للدردشة العادية وآخر لاستخدام الأدوات، أو التوليد القائم على الاسترجاع. في هذه الحالات، تكون `tokenizer.chat_template` قاموسًا. يمكن أن يتسبب هذا في بعض الارتباك، وحيثما أمكن، نوصي باستخدام قالب واحد لجميع حالات الاستخدام. يمكنك استخدام عبارات Jinja مثل `if tools is defined` وتعريفات `{% macro %}` لتضمين مسارات تعليمات برمجية متعددة بسهولة في قالب واحد.
+
+عندما يحتوي المجزئ اللغوي على قوالب متعددة، سيكون `tokenizer.chat_template` قاموسًا (`dict`)، حيث يكون كل مفتاح هو اسم قالب. تحتوي طريقة `apply_chat_template` على معالجة خاصة لأسماء قوالب معينة: على وجه التحديد، ستبحث عن قالب باسم `default` في معظم الحالات، وستُثير خطأً إذا لم تتمكن من العثور على واحد. ومع ذلك، إذا كان هناك قالب باسم `tool_use` وقام المستخدم بتمرير وسيطة `tools`، فستستخدم هذا القالب بدلاً من ذلك. للوصول إلى قوالب بأسماء أخرى، مرر اسم القالب الذي تريده إلى وسيطة `chat_template` لـ `apply_chat_template()`.
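+
+مخطط توضيحي قصير، بافتراض مجزئ لغوي يحتوي على عدة قوالب مسماة، ووجود `messages` و `tools` كما في الأمثلة السابقة:
+
+```python
+# عندما توجد عدة قوالب، يكون chat_template قاموسًا من الاسم إلى نص القالب
+print(tokenizer.chat_template.keys())  # على سبيل المثال: dict_keys(['default', 'tool_use', 'rag'])
+
+# مرر اسم القالب المطلوب صراحةً إلى apply_chat_template
+text = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, tokenize=False)
+```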
+
+نجد أن هذا قد يكون مُربكًا بعض الشيء للمستخدمين - لذلك إذا كنت تكتب قالبًا بنفسك، فننصحك بمحاولة وضعه كله في قالب واحد حيثما أمكن!
+
+### ما القالب الذي يجب أن أستخدمه؟
+
+عند تعيين قالب لنموذج تم تدريبه بالفعل على الدردشة، يجب التأكد من أن القالب يتطابق تمامًا مع تنسيق الرسالة الذي شاهده النموذج أثناء التدريب، وإلا فمن المحتمل أن تواجه تدهورًا في الأداء. هذا صحيح حتى إذا كنت تدرب النموذج بشكل إضافي - فمن المحتمل أن تحصل على أفضل أداء إذا قمت بإبقاء رموز الدردشة ثابتة. يُشبه هذا إلى حد كبير عملية التجزئة - فأنت تحصل بشكل عام على أفضل أداء للاستدلال أو الضبط الدقيق عندما تتطابق بدقة مع التجزئة المستخدمة أثناء التدريب.
+
+من ناحية أخرى، إذا كنت تُدرّب نموذجًا من البداية، أو تقوم بضبط دقيق لنموذج لغة أساسي للدردشة، لديك حرية اختيار قالب مناسب! تتمتع LLMs بالذكاء الكافي للتعامل مع العديد من تنسيقات الإدخال المختلفة. أحد الخيارات الشائعة هو تنسيق "ChatML"، وهو خيار جيد ومرن للعديد من حالات الاستخدام. يبدو كالتالي:
+
+```
+{%- for message in messages %}
+ {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
+{%- endfor %}
+```
+
+إذا أعجبك هذا، فإليك نسخة من سطر واحد جاهزة لوضعها في كودك. يتضمن هذا السطر أيضًا دعمًا مفيدًا [لمطالبات التوليد](#what-are-generation-prompts)، ولكن لاحظ أنه لا يضيف رموز BOS أو EOS! إذا كان نموذجك يتوقع هذه الرموز، فلن تتم إضافتها تلقائيًا بواسطة `apply_chat_template` - بمعنى آخر، سيتم تقسيم النص إلى رموز باستخدام `add_special_tokens=False`. هذا لتجنب التعارضات المحتملة بين القالب ومنطق `add_special_tokens`. إذا كان نموذجك يتوقع رموزًا خاصة، فتأكد من إضافتها إلى القالب!
+
+```python
+tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+```
+
+يُحيط هذا القالب كل رسالة بين الرمزين "<|im_start|>" و "<|im_end|>"، ويكتب ببساطة الدور كسلسلة نصية، مما يسمح بالمرونة في الأدوار التي تتدرب عليها. يبدو الناتج كما يلي:
+
+```text
+<|im_start|>system
+You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
+<|im_start|>user
+How are you?<|im_end|>
+<|im_start|>assistant
+I'm doing great!<|im_end|>
+```
+
+تعد أدوار "user" و "system" و "assistant" هي الأدوار القياسية للدردشة، ونوصي باستخدامها عندما يكون ذلك منطقيًا، خاصة إذا كنت تريد أن يعمل نموذجك بشكل جيد مع [`TextGenerationPipeline`]. ومع ذلك، فأنت لست مقيدًا بهذه الأدوار - فإن القوالب مرنة للغاية، ويمكن أن تكون أي سلسلة نصية دورًا.
+
+
+### أريد إضافة بعض قوالب الدردشة! كيف أبدأ؟
+
+إذا كان لديك أي نماذج دردشة، فيجب عليك تعيين الخاصية `tokenizer.chat_template` الخاصة بها واختبارها باستخدام [`~PreTrainedTokenizer.apply_chat_template`]، ثم رفع المجزئ اللغوي المُحدّث إلى Hub. ينطبق هذا حتى إذا لم تكن مالك النموذج - إذا كنت تستخدم نموذجًا بقالب دردشة فارغ، أو لا يزال يستخدم قالب الفئة الافتراضي، فيرجى فتح [طلب سحب](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) إلى مستودع النموذج حتى يمكن تعيين الخاصية بشكل صحيح!
+
+بمجرد تعيين الخاصية، هذا كل شيء، لقد انتهيت! ستعمل `tokenizer.apply_chat_template` الآن بشكل صحيح لهذا النموذج، مما يعني أنها مدعومة أيضًا بشكل تلقائي في أماكن مثل [`TextGenerationPipeline`]!
+
+من خلال ضمان امتلاك النماذج لهذه الخاصية، يمكننا التأكد من أن المجتمع بأكمله يستخدم القوة الكاملة للنماذج مفتوحة المصدر. لقد ظلّت حالات عدم تطابق التنسيق تطارد هذا المجال وتضر بالأداء بصمت لفترة طويلة جدًا - لقد حان الوقت لوضع حد لها!
+
+## متقدم: نصائح لكتابة القوالب
+
+
+أسهل طريقة للبدء في كتابة قوالب Jinja هي إلقاء نظرة على بعض القوالب الموجودة. يمكنك استخدام `print(tokenizer.chat_template)` لأي نموذج دردشة لمعرفة القالب الذي يستخدمه. بشكل عام، تحتوي النماذج التي تدعم استخدام الأدوات على قوالب أكثر تعقيدًا بكثير من النماذج الأخرى - لذلك عندما تبدأ للتو، فمن المحتمل أنها مثال سيئ للتعلم منه! يمكنك أيضًا إلقاء نظرة على [وثائق Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) للحصول على تفاصيل حول تنسيق Jinja العام وتركيبه.
+
+
+
+تُطابق قوالب Jinja في `transformers` قوالب Jinja في أي مكان آخر. الشيء الرئيسي الذي يجب معرفته هو أن سجل الدردشة سيكون متاحًا داخل قالبك كمتغير يسمى `messages`. ستتمكن من الوصول إلى `messages` في قالبك تمامًا كما يمكنك في Python، مما يعني أنه يمكنك التكرار خلاله باستخدام `{% for message in messages %}` أو الوصول إلى رسائل فردية باستخدام `{{ messages[0] }}`، على سبيل المثال.
+
+يمكنك أيضًا استخدام النصائح التالية لكتابة قوالب Jinja نظيفة وفعالة:
+
+### اقتطاع المسافات الفارغة
+
+بشكل افتراضي، ستطبع Jinja أي مسافات فارغة تأتي قبل أو بعد كتلة. يمكن أن يكون هذا مشكلة لقوالب الدردشة، والتي تريد عادةً أن تكون دقيقة جدًا مع المسافات! لتجنب ذلك، نوصي بشدة بكتابة قوالبك على النحو التالي:
+
+```
+{%- for message in messages %}
+ {{- message['role'] + message['content'] }}
+{%- endfor %}
+```
+
+بدلاً من ذلك:
+
+```
+{% for message in messages %}
+ {{ message['role'] + message['content'] }}
+{% endfor %}
+```
+
+ستؤدي إضافة `-` إلى إزالة أي مسافات فارغة تأتي قبل الكتلة. يبدو المثال الثاني سليمًا ظاهريًا، ولكن قد يتم تضمين السطر الجديد والمسافة البادئة في المخرجات، وهو على الأرجح ليس ما تريده!
+
+
+### المتغيرات الخاصة
+
+ داخل قالبك، سيكون لديك حق الوصول إلى العديد من المتغيرات الخاصة. أهمها هو `messages`، والذي يحتوي على سجل الدردشة كقائمة من قواميس الرسائل. ومع ذلك، هناك العديد من المتغيرات الأخرى. لن يتم استخدام كل متغير في كل قالب. المتغيرات الأكثر شيوعًا هي:
+
+- `tools` تحتوي على قائمة بالأدوات بتنسيق مخطط JSON. ستكون `None` أو غير مُعرّفة إذا لم يتم تمرير أي أدوات.
+- `documents` تحتوي على قائمة من المستندات بالتنسيق `{"title": "العنوان", "contents": "المحتويات"}`، تُستخدم للتوليد المُعزز بالاسترجاع. ستكون `None` أو غير مُعرّفة إذا لم يتم تمرير أي مستندات.
+- `add_generation_prompt` هي قيمة منطقية تكون `True` إذا طلب المستخدم مُطالبة توليد، و `False` بخلاف ذلك. إذا تم تعيين هذا، فيجب أن يُضيف قالبك رأس رسالة مساعد إلى نهاية المحادثة. إذا لم يكن لدى نموذجك رأس مُحدد لرسائل المساعد، فيمكنك تجاهل هذا العلم.
+- **الرموز الخاصة** مثل `bos_token` و `eos_token`. يتم استخراجها من `tokenizer.special_tokens_map`. ستختلف الرموز الدقيقة المتاحة داخل كل قالب اعتمادًا على المجزىء اللغوي الأصلي.
+
+
+
+
+يمكنك في الواقع تمرير أي `kwarg` إلى `apply_chat_template`، وسيكون متاحًا داخل القالب كمتغير. بشكل عام، نوصي بمحاولة الالتزام بالمتغيرات الأساسية المذكورة أعلاه، لأن مطالبة المستخدمين بكتابة تعليمات برمجية مخصصة لتمرير `kwargs` خاصة بالنموذج ستجعل نموذجك أصعب في الاستخدام. ومع ذلك، فنحن ندرك أن هذا المجال يتحرك بسرعة، لذلك إذا كانت لديك حالة استخدام جديدة لا تتناسب مع واجهة برمجة التطبيقات الأساسية، فلا تتردد في استخدام `kwarg` جديد لها! إذا أصبح `kwarg` جديد شائعًا، فقد نقوم بترقيته إلى واجهة برمجة التطبيقات الأساسية وإنشاء توثيق قياسي له.
+
+
+
+### دوال قابلة للاستدعاء
+
+هناك أيضًا قائمة قصيرة من الدوال القابلة للاستدعاء المتاحة لك داخل قوالبك. هذه هي:
+
+- `raise_exception(msg)`: تُثير `TemplateException`. هذا مفيد لتصحيح الأخطاء، ولإخبار المستخدمين عندما يفعلون شيئًا لا يدعمه قالبك.
+- `strftime_now(format_str)`: تُكافئ `datetime.now().strftime(format_str)` في Python. يُستخدم هذا للحصول على التاريخ/الوقت الحالي بتنسيق مُحدد، والذي يتم تضمينه أحيانًا في رسائل النظام.
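+
+فيما يلي مقتطف توضيحي (افتراضي) يوضح كيفية استخدام هاتين الدالتين داخل قالب؛ الرسائل والتنسيق هنا مجرد مثال وليست جزءًا من أي قالب فعلي:
+
+```text
+{%- if tools is defined and tools %}
+    {{- raise_exception('This template does not support tool use!') }}
+{%- endif %}
+{{- 'Today Date: ' + strftime_now('%d %b %Y') + '\n' }}
+```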
+
+### التوافق مع Jinja غير Python
+
+هناك تطبيقات متعددة لـ Jinja بلغات مختلفة. عادةً ما يكون لها نفس التركيب، ولكن الاختلاف الرئيسي هو أنه عند كتابة قالب في Python، يمكنك استخدام أساليب Python، مثل `.lower()` على السلاسل أو `.items()` على القواميس. سيتعطل هذا إذا حاول شخص ما استخدام قالبك في تطبيق غير Python لـ Jinja. التطبيقات غير Python شائعة بشكل خاص في بيئات النشر، حيث تنتشر JS و Rust كثيرًا.
+
+لا تقلق، على الرغم من ذلك! هناك بعض التغييرات البسيطة التي يمكنك إجراؤها على قوالبك لضمان توافقها عبر جميع تطبيقات Jinja:
+
+- استبدل أساليب Python بمرشحات Jinja. عادةً ما يكون لها نفس الاسم؛ على سبيل المثال، يصبح `string.lower()` هو `string|lower`، ويصبح `dict.items()` هو `dict|items`. أحد التغييرات الملحوظة هو أن `string.strip()` يصبح `string|trim`. راجع [قائمة المرشحات المدمجة](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) في وثائق Jinja لمزيد من المعلومات.
+- استبدل `True` و `False` و `None`، وهي خاصة بـ Python، بـ `true` و `false` و `none`.
+- قد يؤدي عرض قاموس أو قائمة مباشرةً إلى نتائج مختلفة في التطبيقات الأخرى (على سبيل المثال، قد تتغير المدخلات النصية من علامات اقتباس مفردة ' إلى علامات اقتباس مزدوجة "). يمكن أن يساعد إضافة المرشح `tojson` في ضمان الاتساق هنا.
+
+### كتابة مطالبات التوليد
+لقد ذكرنا أعلاه أن `add_generation_prompt` متغير خاص يمكن الوصول إليه داخل قالبك، ويتحكم فيه المستخدم من خلال تعيين معامل `add_generation_prompt`. إذا كان نموذجك يتوقع رأسًا لرسائل المساعد، فيجب أن يدعم قالبك إضافة هذا الرأس عند تعيين `add_generation_prompt`.
+
+فيما يلي مثال على قالب يُنسّق الرسائل بأسلوب ChatML، مع دعم مُطالبة التوليد:
+
+```text
+{{- bos_token }}
+{%- for message in messages %}
+ {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+{%- endif %}
+```
+سيعتمد المحتوى الدقيق لعنوان المساعد على نموذجك المُحدد، ولكن يجب أن يكون دائمًا السلسلة النصية التي تُمثل بداية رسالة المساعد، بحيث إذا قام المستخدم بتطبيق قالبك باستخدام add_generation_prompt=True ثم قام بتوليد نص، سيكتب النموذج استجابة المساعد. لاحظ أيضًا أن بعض النماذج لا تحتاج إلى مُطالبة توليد، لأن رسائل المساعد تبدأ دائمًا فورًا بعد رسائل المستخدم. هذا شائع بشكل خاص لنماذج LLaMA و Mistral، حيث تبدأ رسائل المساعد فورًا بعد رمز [/INST] الذي ينهي رسائل المستخدم. في هذه الحالات، يمكن للقالب تجاهل معامل add_generation_prompt.
+
+مُطالبات التوليد مُهمة! إذا كان نموذجك يتطلب مُطالبة توليد ولكنها غير مُعيّنة في القالب، فمن المُحتمل أن تتدهور عمليات توليد النموذج بشدة، أو قد يُظهر النموذج سلوكًا غير عادي مثل متابعة رسالة المستخدم الأخيرة!
+
+### كتابة قوالب أكبر وتصحيحها
+عندما تم تقديم هذه الميزة، كانت معظم القوالب صغيرة جدًا، أي ما يُعادل نص برمجي "من سطر واحد" في Jinja. ومع ذلك، مع النماذج والميزات الجديدة مثل استخدام الأدوات و RAG، يمكن أن يصل طول بعض القوالب إلى 100 سطر أو أكثر. عند كتابة قوالب كهذه، من الجيد كتابتها في ملف مُنفصل، باستخدام مُحرر نصوص. يمكنك بسهولة استخراج قالب دردشة إلى ملف:
+
+```python
+open("template.jinja", "w").write(tokenizer.chat_template)
+```
+أو تحميل القالب المُحرر مرة أخرى إلى المعالج اللغوي:
+
+```python
+tokenizer.chat_template = open("template.jinja").read()
+```
+كميزة إضافية، عندما تكتب قالبًا طويلاً متعدد الأسطر في ملف مُنفصل، ستتوافق أرقام الأسطر في هذا الملف تمامًا مع أرقام الأسطر في أخطاء تحليل القالب أو تنفيذه. سيُسهّل هذا كثيرًا تحديد مكان المشكلات.
+
+### كتابة قوالب للأدوات
+على الرغم من أن قوالب الدردشة لا تفرض واجهة برمجة تطبيقات مُحددة للأدوات (أو لأي شيء حقًا)، فإننا نوصي مؤلفي القوالب بمحاولة الالتزام بواجهة برمجة تطبيقات قياسية حيثما أمكن. الهدف النهائي لقوالب الدردشة هو السماح بنقل التعليمات البرمجية عبر النماذج، لذا فإن الانحراف عن واجهة برمجة تطبيقات الأدوات القياسية يعني أن المستخدمين سيضطرون إلى كتابة تعليمات برمجية مخصصة لاستخدام الأدوات مع نموذجك. في بعض الأحيان يكون ذلك أمرًا لا مفر منه، ولكن غالبًا ما يكون من الممكن استخدام واجهة برمجة التطبيقات القياسية من خلال استخدام قوالب ذكية!
+
+أدناه، سنُدرج عناصر واجهة برمجة التطبيقات القياسية، ونقدم نصائح حول كتابة قوالب ستعمل بشكل جيد معها.
+
+#### تعريفات الأدوات
+يجب أن يتوقع قالبك أن يكون المتغير `tools` إما فارغًا (إذا لم يتم تمرير أي أدوات)، أو قائمة من قواميس مخطط JSON. تسمح أساليب قوالب الدردشة لدينا للمستخدمين بتمرير الأدوات إما كمخطط JSON أو كدوال Python، ولكن عندما يتم تمرير الدوال، فإننا نقوم تلقائيًا بإنشاء مخطط JSON وتمريره إلى قالبك. نتيجة لذلك، سيكون متغير `tools` الذي يستقبله قالبك دائمًا قائمة من مخططات JSON. فيما يلي مخطط JSON نموذجي لأداة:
+
+```json
+{
+ "type": "function",
+ "function": {
+ "name": "multiply",
+ "description": "دالة تضرب عددين",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "a": {
+ "type": "number",
+ "description": "الرقم الأول للضرب"
+ },
+ "b": {
+ "type": "number",
+ "description": "الرقم الثاني للضرب"
+ }
+ },
+ "required": ["a", "b"]
+ }
+ }
+}
+```
+
+وهنا بعض الأمثلة البرمجية للتعامل مع الأدوات في قالب الدردشة الخاص بك. تذكر أن هذا مجرد مثال لتنسيق مُحدد - من المحتمل أن يحتاج نموذجك إلى تنسيق مختلف!
+```text
+{%- if tools %}
+ {%- for tool in tools %}
+ {{- '' + tool['function']['name'] + '\n' }}
+ {%- for argument in tool['function']['parameters']['properties'] %}
+ {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
+ {%- endfor %}
+ {{- '\n' }}
+    {%- endfor %}
+{%- endif %}
+```
+
+يجب بالطبع اختيار الرموز المحددة ووصف الأدوات التي يُعرضها قالبك لتتناسب مع تلك التي تم تدريب نموذجك عليها. لا يوجد شرط أن يفهم نموذجك مُدخلات مخطط JSON، فقط أن يتمكن قالبك من ترجمة مخطط JSON إلى تنسيق نموذجك. على سبيل المثال، تم تدريب Command-R باستخدام أدوات مُعرّفة باستخدام رؤوس دوال Python، ولكن يقبل قالب أداة Command-R مخطط JSON، ويُحوّل الأنواع داخليًا ويُعرض أدوات الإدخال كعناوين Python. يمكنك فعل الكثير باستخدام القوالب!
+
+#### استدعاءات الأدوات
+استدعاءات الأدوات، إذا كانت موجودة، ستكون قائمة مُرفقة برسالة بدور "assistant". لاحظ أن tool_calls هي دائمًا قائمة، على الرغم من أن معظم نماذج استدعاء الأدوات تدعم فقط استدعاءات أدوات فردية في كل مرة، مما يعني أن القائمة ستحتوي عادةً على عنصر واحد فقط. هنا قاموس رسالة نموذجي يحتوي على استدعاء أداة:
+
+```json
+{
+ "role": "assistant",
+ "tool_calls": [
+ {
+ "type": "function",
+ "function": {
+ "name": "multiply",
+ "arguments": {
+ "a": 5,
+ "b": 6
+ }
+ }
+ }
+ ]
+}
+```
+والنمط الشائع للتعامل معها سيكون كهذا:
+
+```text
+{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
+ {%- for tool_call in message['tool_calls'] %}
+ {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }}
+    {%- endfor %}
+{%- endif %}
+```
+
+مرة أخرى، يجب عليك عرض استدعاء الأداة بالتنسيق والرموز الخاصة التي يتوقعها نموذجك.
+
+#### استجابات الأدوات
+استجابات الأدوات لها تنسيق بسيط: إنها قاموس رسالة بدور "tool"، ومفتاح "name" يُعطي اسم الدالة المُستدعاة، ومفتاح "content" يحتوي على نتيجة استدعاء الأداة. هنا استجابة أداة نموذجية:
+
+```json
+{
+ "role": "tool",
+ "name": "multiply",
+ "content": "30"
+}
+```
+لست بحاجة إلى استخدام جميع المفاتيح في استجابة الأداة. على سبيل المثال، إذا كان نموذجك لا يتوقع تضمين اسم الدالة في استجابة الأداة، فيمكن أن يكون عرضها بسيطًا مثل:
+
+```text
+{%- if message['role'] == 'tool' %}
+ {{- "" + message['content'] + "" }}
+{%- endif %}
+```
+
+مرة أخرى، تذكر أن التنسيق الفعلي والرموز الخاصة خاصة بالنموذج - يجب أن تُولي عناية كبيرة لضمان أن الرموز والمسافات الفارغة وكل شيء آخر يتطابق تمامًا مع التنسيق الذي تم تدريب نموذجك عليه!
diff --git a/docs/source/ar/create_a_model.md b/docs/source/ar/create_a_model.md
new file mode 100644
index 00000000000000..6b511fe0de4a8a
--- /dev/null
+++ b/docs/source/ar/create_a_model.md
@@ -0,0 +1,436 @@
+# إنشاء بنية مخصصة
+
+تحدد فئة [`AutoClass`](model_doc/auto) تلقائيًا بنية النموذج وتقوم بتنزيل تكوين وأوزان مسبقين للنموذج. بشكل عام، نوصي باستخدام `AutoClass` لإنتاج كود غير مرتبط بنسخة معينة. ولكن يمكن للمستخدمين الذين يريدون مزيدًا من التحكم في معلمات النموذج المحددة إنشاء نموذج مخصص من 🤗 Transformers من مجرد بضع فئات أساسية. قد يكون هذا مفيدًا بشكل خاص لأي شخص مهتم بدراسة نموذج 🤗 Transformers أو تدريبه أو إجراء تجارب عليه. في هذا الدليل، سنغوص بشكل أعمق في إنشاء نموذج مخصص بدون `AutoClass`. تعرف على كيفية:
+
+- تحميل تكوين النموذج وتخصيصه.
+- إنشاء بنية نموذج.
+- إنشاء مجزئ لغوي سريع وبطيء للنصوص.
+- إنشاء معالج صور لمهام الرؤية.
+- إنشاء مستخرج ميزات لمهام الصوت.
+- إنشاء معالج للمهام متعددة الوسائط.
+
+## التكوين
+
+يشير مصطلح [التكوين](main_classes/configuration) إلى الخصائص المحددة للنموذج. لكل تكوين نموذج خصائصه الخاصة؛ على سبيل المثال، تشترك جميع نماذج NLP في الخصائص `hidden_size` و`num_attention_heads` و`num_hidden_layers` و`vocab_size` المشتركة. تحدد هذه الخصائص عدد رؤوس الانتباه أو الطبقات المخفية لبناء نموذج بها.
+
+اطلع على [DistilBERT](model_doc/distilbert) من خلال [`DistilBertConfig`] لمعاينة خصائصه:
+
+```py
+>>> from transformers import DistilBertConfig
+
+>>> config = DistilBertConfig()
+>>> print(config)
+DistilBertConfig {
+ "activation": "gelu",
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "initializer_range": 0.02,
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "pad_token_id": 0,
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "transformers_version": "4.16.2",
+ "vocab_size": 30522
+}
+```
+
+يعرض [`DistilBertConfig`] جميع الخصائص الافتراضية المستخدمة لبناء نموذج [`DistilBertModel`] أساسي. جميع الخصائص قابلة للتعديل، مما يتيح مجالًا للتجريب. على سبيل المثال، يمكنك تعديل نموذج افتراضي لـ:
+
+- تجربة دالة تنشيط مختلفة باستخدام معامل `activation`.
+- استخدام معدل إسقاط (dropout) أعلى لاحتمالات الانتباه باستخدام معامل `attention_dropout`.
+
+```py
+>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
+>>> print(my_config)
+DistilBertConfig {
+ "activation": "relu",
+ "attention_dropout": 0.4,
+
+```
+
+يمكن تعديل خصائص النموذج المدرب مسبقًا في دالة [`~PretrainedConfig.from_pretrained`] :
+
+```py
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+```
+
+بمجرد أن تصبح راضيًا عن تكوين نموذجك، يمكنك حفظه باستخدام [`~PretrainedConfig.save_pretrained`]. يتم تخزين ملف التكوين الخاص بك على أنه ملف JSON في دليل الحفظ المحدد:
+
+```py
+>>> my_config.save_pretrained(save_directory="./your_model_save_path")
+```
+
+لإعادة استخدام ملف التكوين، قم بتحميله باستخدام [`~PretrainedConfig.from_pretrained`]:
+
+```py
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
+```
+
+
+يمكنك أيضًا حفظ ملف التكوين كقاموس، أو حتى كالفرق بين خصائص التكوين المعدلة وخصائص التكوين الافتراضية! راجع وثائق [التكوين](main_classes/configuration) لمزيد من التفاصيل.
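+
+مخطط توضيحي قصير لهذه الإمكانية:
+
+```py
+>>> config_dict = my_config.to_dict()       # جميع الخصائص كقاموس كامل
+>>> diff_dict = my_config.to_diff_dict()    # الخصائص التي تختلف عن القيم الافتراضية فقط
+>>> my_config.to_json_file("config.json", use_diff=True)  # حفظ الفرق فقط في ملف JSON
+```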
+
+
+
+## النموذج
+
+الخطوة التالية هي إنشاء [نموذج](main_classes/models). النموذج - ويُشار إليه أحيانًا باسم البنية - يحدد وظيفة كل طبقة والعمليات الحسابية المنفذة. تُستخدم خصائص مثل `num_hidden_layers` من التكوين لتحديد هذه البنية. تشترك جميع النماذج في فئة أساسية واحدة هي [`PreTrainedModel`] وبعض الوظائف المشتركة مثل تغيير حجم تضمينات المدخلات وتقليم رؤوس آلية الانتباه الذاتي. بالإضافة إلى ذلك، فإن جميع النماذج هي فئات فرعية إما من [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) أو [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) أو [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html). هذا يعني أن النماذج متوافقة مع جميع استخدامات إطار العمل الخاص بها.
+
+
+
+قم بتحميل خصائص التكوين المخصصة الخاصة بك في النموذج:
+
+```py
+>>> from transformers import DistilBertModel
+
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
+>>> model = DistilBertModel(my_config)
+```
+
+هذا ينشئ نموذجًا بقيم عشوائية بدلاً من الأوزان المُدربة مسبقًا. لن يكون هذا النموذج مفيدًا حتى يتم تدريبه. تُعد عملية التدريب مكلفة وتستغرق وقتًا طويلاً. من الأفضل بشكل عام استخدام نموذج مُدرب مسبقًا للحصول على نتائج أفضل بشكل أسرع، مع استخدام جزء بسيط فقط من الموارد المطلوبة للتدريب.
+
+قم بإنشاء نموذج مُدرب مسبقًا باستخدام [`~PreTrainedModel.from_pretrained`]:
+
+```py
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+عند تحميل الأوزان المُدربة مسبقًا، يتم تحميل تكوين النموذج الافتراضي تلقائيًا إذا كان النموذج من مكتبة 🤗 Transformers. ومع ذلك، يمكنك أيضًا استبدال بعض أو كل إعدادات النموذج الافتراضية بإعداداتك الخاصة:
+
+```py
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
+```
+
+
+قم بتحميل خصائص التكوين المُخصصة الخاصة بك في النموذج:
+
+```py
+>>> from transformers import TFDistilBertModel
+
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
+>>> tf_model = TFDistilBertModel(my_config)
+```
+
+هذا ينشئ نموذجًا بقيم عشوائية بدلاً من الأوزان المُدربة مسبقًا. لن يكون هذا النموذج مفيدًا حتى يتم تدريبه. تُعد عملية التدريب مكلفة وتستغرق وقتًا طويلاً. من الأفضل بشكل عام استخدام نموذج مُدرب مسبقًا للحصول على نتائج أفضل بشكل أسرع، مع استخدام جزء بسيط فقط من الموارد المطلوبة للتدريب.
+
+قم بإنشاء نموذج مُدرب مسبقًا باستخدام [`~TFPreTrainedModel.from_pretrained`]:
+
+```py
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+عندما تقوم بتحميل الأوزان المُدربة مسبقًا، يتم تحميل إعدادات النموذج الافتراضية تلقائيًا إذا كان النموذج من مكتبة 🤗 Transformers. ومع ذلك، يمكنك أيضًا استبدال بعض أو كل إعدادات النموذج الافتراضية بإعداداتك الخاصة:
+
+```py
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
+```
+
+
+
+### رؤوس النموذج
+
+في هذه المرحلة، لديك نموذج DistilBERT الأساسي الذي يُخرج *الحالات الكامنة*. تُمرَّر هذه الحالات الكامنة كمدخلات لرأس النموذج لإنتاج المخرجات النهائية. توفر مكتبة 🤗 Transformers رأس نموذج مختلفًا لكل مهمة طالما أن النموذج يدعم المهمة (أي لا يمكنك استخدام DistilBERT لمهمة تسلسل إلى تسلسل مثل الترجمة).
+
+
+
+على سبيل المثال، [`DistilBertForSequenceClassification`] هو نموذج DistilBERT الأساس مزودًا برأس تصنيف تسلسلي. يُشكّل رأس التصنيف التسلسلي طبقة خطية فوق المخرجات المجمعة.
+
+```py
+>>> from transformers import DistilBertForSequenceClassification
+
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+أعد استخدام نقطة التحقق هذه لمهمة أخرى بسهولة بتغيير رأس النموذج. ففي مهمة الإجابة على الأسئلة، ستستخدم رأس النموذج [`DistilBertForQuestionAnswering`]. رأس الإجابة على الأسئلة مشابه لرأس التصنيف التسلسلي، باستثناء أنه طبقة خطية فوق مخرجات الحالات الكامنة.
+
+```py
+>>> from transformers import DistilBertForQuestionAnswering
+
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+
+على سبيل المثال، [`TFDistilBertForSequenceClassification`] هو نموذج DistilBERT الأساسي برأس تصنيف تسلسل. رأس التصنيف التسلسلي هو طبقة خطية أعلى المخرجات المجمعة.
+
+```py
+>>> from transformers import TFDistilBertForSequenceClassification
+
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+أعد استخدام نقطة التحقق هذه لمهمة أخرى بالتبديل إلى رأس نموذج مختلف. لمهمة الإجابة على الأسئلة، ستستخدم رأس النموذج [`TFDistilBertForQuestionAnswering`]. رأس الإجابة على الأسئلة مشابه لرأس التصنيف التسلسلي، باستثناء أنه طبقة خطية فوق الحالات الكامنة للمخرجات.
+
+```py
+>>> from transformers import TFDistilBertForQuestionAnswering
+
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+
+
+## مجزئ النصوص
+
+الفئة الأساسية الأخيرة التي تحتاجها قبل استخدام نموذج للبيانات النصية هي [مجزئ النصوص](main_classes/tokenizer) لتحويل النص الخام إلى موترات (tensors). هناك نوعان من مجزئات النصوص يمكنك استخدامهما مع 🤗 Transformers:
+
+- [`PreTrainedTokenizer`]: تنفيذ Python لمجزئ النصوص.
+- [`PreTrainedTokenizerFast`]: مجزئ النصوص من مكتبة [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) المُبنية على لغة Rust. هذا النوع من المجزئات أسرع بكثير، خاصةً عند معالجة دفعات النصوص، وذلك بفضل تصميمه بلغة Rust. كما يوفر مجزئ النصوص السريع طرقًا إضافية مثل *مخطط الإزاحة* الذي يُطابق الرموز بكلماتها أو أحرفها الأصلية.
+
+يدعم كلا النوعين من المجزئات طرقًا شائعة مثل الترميز وفك الترميز، وإضافة رموز جديدة، وإدارة الرموز الخاصة.
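+
+مثال توضيحي قصير على هذه الطرق الشائعة:
+
+```py
+>>> from transformers import DistilBertTokenizerFast
+
+>>> tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
+>>> ids = tokenizer.encode("Hello world!")  # الترميز: نص -> معرفات رموز
+>>> tokenizer.decode(ids)                   # فك الترميز: معرفات رموز -> نص
+'[CLS] hello world! [SEP]'
+>>> tokenizer.add_tokens(["<new_token>"])   # إضافة رموز جديدة إلى القاموس
+1
+```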
+
+
+
+لا يدعم كل نموذج مجزئ نصوص سريعًا. ألقِ نظرة على هذا [الجدول](index#supported-frameworks) للتحقق مما إذا كان النموذج يدعم مجزئ نصوص سريعًا.
+
+
+
+إذا دربت مجزئ نصوص خاصًا بك، فيمكنك إنشاء واحد من ملف *القاموس* الخاص بك:
+
+```py
+>>> from transformers import DistilBertTokenizer
+
+>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
+```
+
+من المهم أن تتذكر أن قاموس مجزئ النصوص المُخصص سيكون مختلفًا عن قاموس مجزئ النصوص نموذج مُدرّب مسبقًا. يجب عليك استخدام قاموس نموذج مُدرّب مسبقًا إذا كنت تستخدم نموذجًا مُدرّبًا مسبقًا، وإلا فلن تكون المدخلات ذات معنى. قم بإنشاء مجزئ النصوص باستخدام قاموس نموذج مُدرّب مسبقًا باستخدام فئة [`DistilBertTokenizer`]:
+
+```py
+>>> from transformers import DistilBertTokenizer
+
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+قم بإنشاء مجزئ نصوص سريع باستخدام فئة [`DistilBertTokenizerFast`]:
+
+```py
+>>> from transformers import DistilBertTokenizerFast
+
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+
+افتراضيًا، سيحاول [`AutoTokenizer`] تحميل مجزئ نصوص سريع. يمكنك تعطيل هذا السلوك عن طريق تعيين `use_fast=False` في `from_pretrained`.
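+
+مثال توضيحي قصير:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> slow_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", use_fast=False)
+```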
+
+
+## معالج الصور
+
+يعالج معالج الصور بيانات الرؤية. وهو يرث من الفئة الأساسية [`~image_processing_utils.ImageProcessingMixin`].
+
+لبناء معالج صور خاص بالنموذج المستخدم، أنشئ مثلاً مُعالج [`ViTImageProcessor`] افتراضيًا إذا كنت تستخدم [ViT](model_doc/vit) لتصنيف الصور:
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> vit_extractor = ViTImageProcessor()
+>>> print(vit_extractor)
+ViTImageProcessor {
+ "do_normalize": true,
+ "do_resize": true,
+ "image_processor_type": "ViTImageProcessor",
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "resample": 2,
+ "size": 224
+}
+```
+
+
+
+إذا كنت لا تبحث عن أي تخصيص، فما عليك سوى استخدام طريقة `from_pretrained` لتحميل معلمات معالج الصور الافتراضية للنموذج.
+
+
+
+عدل أيًا من معلمات [`ViTImageProcessor`] لإنشاء معالج الصور المخصص الخاص بك:
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
+>>> print(my_vit_extractor)
+ViTImageProcessor {
+ "do_normalize": false,
+ "do_resize": true,
+ "image_processor_type": "ViTImageProcessor",
+ "image_mean": [
+ 0.3,
+ 0.3,
+ 0.3
+ ],
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "resample": "PIL.Image.BOX",
+ "size": 224
+}
+```
+## العمود الفقري
+
+
+
+
+
+تتكون نماذج رؤية الحاسب من جزء أساسي (backbone)، وجزء وسيط (neck)، وجزء معالجة نهائي (head). يستخرج الجزء الأساسي الميزات من صورة الإدخال، ويجمع الجزء الوسيط هذه الميزات المستخرجة ويعززها، ويُستخدم الجزء النهائي للمهمة الرئيسية (مثل اكتشاف الأجسام). ابدأ بتهيئة الجزء الأساسي في تكوين النموذج وحدد ما إذا كنت تريد تحميل أوزان مدربة مسبقًا أو أوزان عشوائية. بعد ذلك، يمكنك تمرير تكوين النموذج إلى جزء المعالجة النهائي.
+
+على سبيل المثال، لتحميل [ResNet](../model_doc/resnet) backbone في نموذج [MaskFormer](../model_doc/maskformer) مع رأس تجزئة مثيل:
+
+
+
+
+قم بتعيين `use_pretrained_backbone=True` لتحميل أوزان ResNet المُدرّبة مسبقًا للعمود الفقري.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط
+model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي
+```
+
+
+
+
+قم بتعيين `use_pretrained_backbone=False` لتهيئة جزء ResNet الأساسي بشكل عشوائي.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # تكوين الجزء الأساسي والجزء الوسيط
+model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي
+```
+
+يمكنك أيضًا تحميل تكوين الجزء الأساسي بشكل منفصل، ثم تمريره إلى تكوين النموذج.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+
+backbone_config = ResNetConfig()
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+
+
+
+يتم تحميل نماذج [timm](https://hf.co/docs/timm/index) داخل نموذج باستخدام `use_timm_backbone=True` أو باستخدام [`TimmBackbone`] و [`TimmBackboneConfig`].
+
+استخدم `use_timm_backbone=True` و `use_pretrained_backbone=True` لتحميل أوزان timm المُدرّبة مسبقًا للجزء الأساسي.
+
+```python
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط
+model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي
+```
+
+قم بتعيين `use_timm_backbone=True` و `use_pretrained_backbone=False` لتحميل عمود فقري timm مبدئي عشوائي.
+
+```python
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط
+model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي
+```
+
+يمكنك أيضًا تحميل تكوين الجزء الأساسي واستخدامه لإنشاء `TimmBackbone` أو تمريره إلى تكوين النموذج. سيتم تحميل أوزان timm المُدرّبة مسبقًا للجزء الأساسي افتراضيًا. عيّن `use_pretrained_backbone=False` لتحميل الأوزان المبدئية العشوائية.
+
+```python
+from transformers import TimmBackboneConfig, TimmBackbone
+
+backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=False)
+
+# قم بإنشاء مثيل من العمود الفقري
+backbone = TimmBackbone(config=backbone_config)
+
+# قم بإنشاء نموذج باستخدام عمود فقري timm
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+## مستخرج الميزات
+
+يقوم مُستخرج الميزات بمعالجة المدخلات الصوتية. يرث من فئة الأساس [`~feature_extraction_utils.FeatureExtractionMixin`]، وقد يرث أيضًا من فئة [`SequenceFeatureExtractor`] لمعالجة المدخلات الصوتية.
+
+للاستخدام، قم بإنشاء مستخرج ميزات مرتبط بالنموذج الذي تستخدمه. على سبيل المثال، قم بإنشاء مستخرج ميزات Wav2Vec2 الافتراضي إذا كنت تستخدم [Wav2Vec2](model_doc/wav2vec2) لتصنيف الصوت:
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> w2v2_extractor = Wav2Vec2FeatureExtractor()
+>>> print(w2v2_extractor)
+Wav2Vec2FeatureExtractor {
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "return_attention_mask": false,
+ "sampling_rate": 16000
+}
+```
+
+
+إذا لم تكن بحاجة لأي تخصيص، فاستخدم فقط طريقة `from_pretrained` لتحميل معلمات مستخرج الميزات الافتراضية للنموذج.
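+
+على سبيل المثال، مخطط بسيط لتحميل مستخرج الميزات الافتراضي من نقطة حفظ Wav2Vec2 مُدرّبة مسبقًا (اسم نقطة الحفظ هنا مجرد افتراض للتوضيح):
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> w2v2_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+```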
+
+
+قم بتعديل أي من معلمات [`Wav2Vec2FeatureExtractor`] لإنشاء مستخرج ميزات مخصص:
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False)
+>>> print(w2v2_extractor)
+Wav2Vec2FeatureExtractor {
+  "do_normalize": false,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": false,
+  "sampling_rate": 8000
+}
+```
+
+## المعالج
+
+بالنسبة للنماذج التي تدعم مهام الوسائط المتعددة، توفر مكتبة 🤗 Transformers فئة معالج تجمع بفاعلية فئات المعالجة مثل مستخرج الميزات ومقسّم الرموز في كائن واحد. على سبيل المثال، دعنا نستخدم [`Wav2Vec2Processor`] لمهمة التعرف الآلي على الكلام (ASR). تقوم مهمة ASR بتحويل الصوت إلى نص، لذلك ستحتاج إلى مستخرج ميزات ومقسّم رموز.
+
+قم بإنشاء مستخرج ميزات لمعالجة المدخلات الصوتية:
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
+```
+
+قم بإنشاء مقسّم رموز لمعالجة المدخلات النصية:
+
+```py
+>>> from transformers import Wav2Vec2CTCTokenizer
+
+>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
+```
+
+قم بدمج مستخرج الميزات ومقسّم الرموز في [`Wav2Vec2Processor`]:
+
+```py
+>>> from transformers import Wav2Vec2Processor
+
+>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+```
+
+باستخدام فئتين أساسيتين - التكوين والنموذج - بالإضافة إلى فئة معالجة مسبقة (مقسّم رموز أو معالج صور أو مستخرج ميزات أو معالج)، يمكنك إنشاء أي من النماذج التي تدعمها مكتبة 🤗 Transformers. يمكن تكوين كل من هذه الفئات الأساسية، مما يسمح لك باستخدام السمات المطلوبة. يمكنك بسهولة تهيئة نموذج للتدريب أو تعديل نموذج مُدرب مسبقًا لإجراء الضبط الدقيق.
diff --git a/docs/source/ar/custom_models.md b/docs/source/ar/custom_models.md
new file mode 100644
index 00000000000000..daaba5e54ee2f6
--- /dev/null
+++ b/docs/source/ar/custom_models.md
@@ -0,0 +1,323 @@
+# بناء نماذج مخصصة
+
+تم تصميم مكتبة 🤗 Transformers لتكون قابلة للتوسيع بسهولة. كل نموذج مُشفّر بالكامل في مجلد فرعي معين بالمستودع، دون أي تجريد، لذلك يمكنك بسهولة نسخ ملف النمذجة وتعديله وفقًا لاحتياجاتك.
+
+إذا كنت تُنشئ نموذجًا جديدًا تمامًا، فقد يكون من الأسهل البدء من الصفر. في هذا البرنامج التعليمي، سنُرِيك كيفية كتابة نموذج مخصص وتكوينه ليُستخدم داخل Transformers، وكيفية مشاركته مع المجتمع (مع الكود الذي يعتمد عليه) بحيث يمكن لأي شخص استخدامه، حتى إذا لم يكن موجودًا في مكتبة 🤗 Transformers. سنرى كيفية البناء على المحولات ونوسّع الإطار باستخدام الأدوات التي يمكن استخدامها لتعديل سلوك الإطار (hooks) والتعليمات البرمجية المخصصة.
+
+سنوضح كل هذا من خلال نموذج ResNet، بتغليف فئة ResNet من
+[مكتبة timm](https://github.com/rwightman/pytorch-image-models) داخل [`PreTrainedModel`].
+
+## كتابة إعدادات مخصصة
+
+لنبدأ بكتابة تكوين النموذج. تكوين النموذج هو كائن يحتوي على جميع المعلومات اللازمة لبناء النموذج. كما سنرى لاحقًا، يتطلب النموذج كائن `config` لتهيئته، لذا يجب أن يكون هذا الكائن كاملاً.
+
+
+
+تتبع النماذج في مكتبة `transformers` اتفاقية قبول كائن `config` في دالة `__init__` الخاصة بها، ثم تمرير كائن `config` بالكامل إلى الطبقات الفرعية في النموذج، بدلاً من تقسيمه إلى معاملات متعددة. تؤدي كتابة نموذجك بهذا الأسلوب إلى كود أبسط مع "مصدر حقيقة" واضح لأي معاملات فائقة (hyperparameters)، كما يسهل إعادة استخدام الكود من نماذج أخرى في `transformers`.
+
+
+
+في مثالنا، سنعدّل بعض الوسائط في فئة ResNet التي قد نرغب في ضبطها. ستعطينا التكوينات المختلفة أنواع ResNet المختلفة الممكنة. سنقوم بتخزين هذه الوسائط بعد التحقق من صحتها.
+
+```python
+from transformers import PretrainedConfig
+from typing import List
+
+
+class ResnetConfig(PretrainedConfig):
+ model_type = "resnet"
+
+ def __init__(
+ self,
+ block_type="bottleneck",
+ layers: List[int] = [3, 4, 6, 3],
+ num_classes: int = 1000,
+ input_channels: int = 3,
+ cardinality: int = 1,
+ base_width: int = 64,
+ stem_width: int = 64,
+ stem_type: str = "",
+ avg_down: bool = False,
+ **kwargs,
+ ):
+ if block_type not in ["basic", "bottleneck"]:
+            raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.")
+ if stem_type not in ["", "deep", "deep-tiered"]:
+ raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")
+
+ self.block_type = block_type
+ self.layers = layers
+ self.num_classes = num_classes
+ self.input_channels = input_channels
+ self.cardinality = cardinality
+ self.base_width = base_width
+ self.stem_width = stem_width
+ self.stem_type = stem_type
+ self.avg_down = avg_down
+ super().__init__(**kwargs)
+```
+الأشياء الثلاثة المهمة التي يجب تذكرها عند كتابة تكوينك الخاص هي:
+
+- يجب أن ترث من `PretrainedConfig`،
+- يجب أن تقبل دالة `__init__` الخاصة بـ `PretrainedConfig` أي معاملات إضافية (kwargs)،
+- يجب تمرير هذه المعاملات الإضافية إلى دالة `__init__` في الفئة الأساسية الأعلى.
+
+يضمن الإرث حصولك على جميع الوظائف من مكتبة 🤗 Transformers، في حين أن القيدين الثاني والثالث يأتيان من حقيقة أن `PretrainedConfig` لديه حقول أكثر من تلك التي تقوم بتعيينها. عند إعادة تحميل تكوين باستخدام طريقة `from_pretrained`، يجب أن يقبل تكوينك هذه الحقول ثم إرسالها إلى الفئة الأساسية الأعلى.
+
+تحديد `model_type` لتكوينك (هنا `model_type="resnet"`) ليس إلزاميًا، ما لم ترغب في
+تسجيل نموذجك باستخدام الفئات التلقائية (راجع القسم الأخير).
+
+مع القيام بذلك، يمكنك بسهولة إنشاء تكوينك وحفظه مثلما تفعل مع أي تكوين نموذج آخر في
+المكتبة. إليك كيفية إنشاء تكوين resnet50d وحفظه:
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d_config.save_pretrained("custom-resnet")
+```
+
+سيؤدي هذا إلى حفظ ملف باسم `config.json` داخل مجلد `custom-resnet`. يمكنك بعد ذلك إعادة تحميل تكوينك باستخدام
+طريقة `from_pretrained`:
+
+```py
+resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
+```
+
+يمكنك أيضًا استخدام أي طريقة أخرى من فئة [`PretrainedConfig`]، مثل [`~PretrainedConfig.push_to_hub`] لتحميل تكوينك مباشرة إلى Hub.
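+
+على سبيل المثال، مخطط بسيط لدفع التكوين إلى Hub (اسم المستودع هنا مجرد افتراض للتوضيح):
+
+```py
+resnet50d_config.push_to_hub("custom-resnet")
+```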
+
+## كتابة نموذج مخصص
+
+الآن بعد أن أصبح لدينا تكوين ResNet، يمكننا المتابعة لإنشاء نموذجين: الأول يستخرج الميزات المخفية من دفعة من الصور (مثل [`BertModel`]) والآخر مناسب لتصنيف الصور (مثل [`BertForSequenceClassification`]).
+
+كما ذكرنا سابقًا، سنقوم ببناء نموذج مبسط لتسهيل الفهم في هذا المثال. الخطوة الوحيدة المطلوبة قبل كتابة هذه الفئة هي ربط أنواع الكتل (blocks) بفئات الكتل الفعلية. بعد ذلك، يُعرّف النموذج من خلال التكوين عبر تمرير كل شيء إلى فئة `ResNet`:
+
+```py
+from transformers import PreTrainedModel
+from timm.models.resnet import BasicBlock, Bottleneck, ResNet
+from .configuration_resnet import ResnetConfig
+
+
+BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
+
+
+class ResnetModel(PreTrainedModel):
+ config_class = ResnetConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ block_layer = BLOCK_MAPPING[config.block_type]
+ self.model = ResNet(
+ block_layer,
+ config.layers,
+ num_classes=config.num_classes,
+ in_chans=config.input_channels,
+ cardinality=config.cardinality,
+ base_width=config.base_width,
+ stem_width=config.stem_width,
+ stem_type=config.stem_type,
+ avg_down=config.avg_down,
+ )
+
+ def forward(self, tensor):
+ return self.model.forward_features(tensor)
+```
+
+بالنسبة للنموذج الذي سيصنف الصور، فإننا نغير فقط دالة التمرير الأمامي (forward):
+
+```py
+import torch
+
+
+class ResnetModelForImageClassification(PreTrainedModel):
+ config_class = ResnetConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ block_layer = BLOCK_MAPPING[config.block_type]
+ self.model = ResNet(
+ block_layer,
+ config.layers,
+ num_classes=config.num_classes,
+ in_chans=config.input_channels,
+ cardinality=config.cardinality,
+ base_width=config.base_width,
+ stem_width=config.stem_width,
+ stem_type=config.stem_type,
+ avg_down=config.avg_down,
+ )
+
+ def forward(self, tensor, labels=None):
+ logits = self.model(tensor)
+ if labels is not None:
+            loss = torch.nn.functional.cross_entropy(logits, labels)
+ return {"loss": loss, "logits": logits}
+ return {"logits": logits}
+```
+في كلتا الحالتين، لاحظ كيف نرث من `PreTrainedModel` ونستدعي مُهيئ الفئة الرئيسية باستخدام `config` (كما تفعل عند إنشاء وحدة `torch.nn.Module` عادية). ليس من الضروري تعريف `config_class` إلا إذا كنت ترغب في تسجيل نموذجك مع الفئات التلقائية (راجع القسم الأخير).
+
+
+
+إذا كان نموذجك مشابهًا جدًا لنموذج داخل المكتبة، فيمكنك إعادة استخدام نفس التكوين مثل هذا النموذج.
+
+
+
+يمكن لنموذجك أن يعيد أي شيء تريده، ولكن إعادة قاموس مثلما فعلنا لـ
+`ResnetModelForImageClassification`، مع تضمين الخسارة عند تمرير العلامات، سيجعل نموذجك قابلًا للاستخدام مباشرة داخل فئة [`Trainer`]. يعد استخدام تنسيق إخراج آخر أمرًا جيدًا طالما أنك تخطط لاستخدام حلقة تدريب خاصة بك أو مكتبة أخرى للتدريب.
+
+الآن بعد أن أصبح لدينا فئة النموذج، دعنا ننشئ واحدة:
+
+```py
+resnet50d = ResnetModelForImageClassification(resnet50d_config)
+```
+
+يمكنك استخدام أي من طرق فئة [`PreTrainedModel`]، مثل [`~PreTrainedModel.save_pretrained`] أو
+[`~PreTrainedModel.push_to_hub`]. سنستخدم الثاني في القسم التالي، وسنرى كيفية دفع أوزان النموذج مع كود نموذجنا. ولكن أولاً، دعنا نحمل بعض الأوزان المُدرّبة مسبقًا داخل نموذجنا.
+
+في حالة الاستخدام الخاصة بك، فمن المحتمل أن تقوم بتدريب نموذجك المخصص على بياناتك الخاصة. للانتقال بسرعة خلال هذا البرنامج التعليمي،
+سنستخدم الإصدار المُدرّب مسبقًا من resnet50d. نظرًا لأن نموذجنا هو مجرد غلاف حوله، فمن السهل نقل هذه الأوزان:
+
+```py
+import timm
+
+pretrained_model = timm.create_model("resnet50d", pretrained=True)
+resnet50d.model.load_state_dict(pretrained_model.state_dict())
+```
+
+الآن دعونا نرى كيفية التأكد من أنه عند قيامنا بـ [`~PreTrainedModel.save_pretrained`] أو [`~PreTrainedModel.push_to_hub`]، يتم حفظ كود النموذج.
+
+## تسجيل نموذج مع كود مخصص للفئات التلقائية
+
+إذا كنت تكتب مكتبة توسع 🤗 Transformers، فقد ترغب في توسيع الفئات التلقائية لتشمل نموذجك الخاص. يختلف هذا عن نشر الكود إلى Hub بمعنى أن المستخدمين سيحتاجون إلى استيراد مكتبتك للحصول على النماذج المخصصة (على عكس تنزيل كود النموذج تلقائيًا من Hub).
+
+ما دام تكوينك يحتوي على معامل `model_type` مختلف عن أنواع النماذج الحالية، وأن فئات نماذجك لديها خاصية `config_class` الصحيحة، فيمكنك ببساطة إضافتها إلى الفئات التلقائية كما يلي:
+
+```py
+from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
+
+AutoConfig.register("resnet", ResnetConfig)
+AutoModel.register(ResnetConfig, ResnetModel)
+AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
+```
+
+لاحظ أن المعامل الأول المستخدم عند تسجيل تكوينك المخصص في [`AutoConfig`] يجب أن يتطابق مع `model_type`
+الخاص بتكوينك المخصص، وأن المعامل الأول المستخدم عند تسجيل نماذجك المخصصة في أي فئة نموذج تلقائي يجب
+أن يتطابق مع `config_class` الخاص بتلك النماذج.
+
+## إرسال الكود إلى Hub
+
+
+
+هذه واجهة برمجة تطبيقات (API) تجريبية وقد تشهد بعض التغييرات الطفيفة في الإصدارات القادمة.
+
+
+
+أولاً، تأكد من تعريف نموذجك بالكامل في ملف `.py`. يمكن أن يعتمد على الاستيراد النسبي لملفات أخرى طالما أن جميع الملفات موجودة في نفس الدليل (لا ندعم الوحدات الفرعية لهذه الميزة حتى الآن). في مثالنا، سنحدد ملف `modeling_resnet.py` وملف `configuration_resnet.py` في مجلد باسم "resnet_model" في دليل العمل الحالي. يحتوي ملف التكوين على كود لـ `ResnetConfig` ويحتوي ملف النمذجة على كود لـ `ResnetModel` و`ResnetModelForImageClassification`.
+
+```
+.
+└── resnet_model
+ ├── __init__.py
+ ├── configuration_resnet.py
+ └── modeling_resnet.py
+```
+
+يمكن أن يكون ملف `__init__.py` فارغًا، فهو موجود فقط حتى يتمكن Python من اكتشاف أن `resnet_model` يمكن استخدامه كوحدة نمطية (module).
+
+
+
+إذا كنت تقوم بنسخ ملفات النمذجة من المكتبة، فسوف تحتاج إلى استبدال جميع الواردات النسبية في أعلى الملف
+لاستيرادها من حزمة `transformers`.
+
+
+
+لاحظ أنه يمكنك إعادة استخدام (أو توسيع) تكوين/نموذج موجود.
+
+لمشاركة نموذجك مع المجتمع، اتبع الخطوات التالية: أولاً، قم باستيراد نموذج ResNet والتكوين من الملفات التي تم إنشاؤها حديثًا:
+
+```py
+from resnet_model.configuration_resnet import ResnetConfig
+from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
+```
+
+بعد ذلك، يجب عليك إخبار المكتبة بأنك تريد نسخ ملفات الكود الخاصة بهذه الكائنات عند استخدام طريقة `save_pretrained`
+وتسجيلها بشكل صحيح مع فئة تلقائية (خاصةً للنماذج). ما عليك سوى تشغيل:
+
+```py
+ResnetConfig.register_for_auto_class()
+ResnetModel.register_for_auto_class("AutoModel")
+ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
+```
+
+لاحظ أنه لا توجد حاجة لتحديد فئة تلقائية للتكوين (هناك فئة تلقائية واحدة فقط لها،
+[`AutoConfig`]) ولكن الأمر يختلف بالنسبة للنماذج. قد يكون نموذجك المخصص مناسبًا للعديد من المهام المختلفة، لذلك يجب
+تحديد أي من الفئات التلقائية هو الصحيح لنموذجك.
+
+
+
+استخدم `register_for_auto_class()` إذا كنت تريد نسخ ملفات الكود. إذا كنت تفضل استخدام الكود على Hub من مستودع آخر،
+فلا تحتاج إلى استدعائه. في الحالات التي يوجد فيها أكثر من فئة تلقائية واحدة، يمكنك تعديل ملف `config.json` مباشرة باستخدام
+الهيكل التالي:
+
+```json
+"auto_map": {
+ "AutoConfig": "--",
+ "AutoModel": "--",
+ "AutoModelFor": "--",
+},
+```
+
+
+
+بعد ذلك، دعنا نقوم بإنشاء التكوين والنماذج كما فعلنا من قبل:
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d = ResnetModelForImageClassification(resnet50d_config)
+
+pretrained_model = timm.create_model("resnet50d", pretrained=True)
+resnet50d.model.load_state_dict(pretrained_model.state_dict())
+```
+
+الآن، لإرسال النموذج إلى Hub، تأكد من أنك سجّلت الدخول. شغّل الأمر التالي في الطرفية:
+
+```bash
+huggingface-cli login
+```
+
+أو من دفتر ملاحظات:
+
+```py
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+يمكنك بعد ذلك دفع النموذج إلى مساحة الاسم الخاصة بك (أو منظمة أنت عضو فيها) كما يلي:
+
+```py
+resnet50d.push_to_hub("custom-resnet50d")
+```
+
+بالإضافة إلى أوزان النموذج والتكوين بتنسيق json، نسخ هذا أيضًا ملفات النمذجة والتكوين `.py` إلى مجلد `custom-resnet50d` وحمّل النتيجة إلى Hub. يمكنك التحقق من النتيجة في [مستودع النموذج](https://huggingface.co/sgugger/custom-resnet50d) هذا.
+
+راجع [البرنامج التعليمي للمشاركة](model_sharing) لمزيد من المعلومات حول طريقة الدفع إلى Hub.
+
+### استخدام نموذج مع كود مخصص
+
+يمكنك استخدام أي تكوين أو نموذج أو مُجزئ لغوي مع ملفات كود مخصصة في مستودعه باستخدام الفئات التلقائية ودالة `from_pretrained`. تُفحص جميع الملفات والأكواد المرفوعة إلى Hub بحثًا عن البرامج الضارة (راجع وثائق [أمان Hub](https://huggingface.co/docs/hub/security#malware-scanning) لمزيد من المعلومات)، ولكن يجب عليك مراجعة كود النموذج والمؤلف لتجنب تنفيذ تعليمات برمجية ضارة على جهازك. لتفعيل نموذج يحتوي على كود مخصص، عيّن `trust_remote_code=True`:
+
+```py
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
+```
+
+يُنصح بشدة بتحديد رقم إصدار (commit hash) كـ `revision` للتأكد من عدم تعديل مؤلف النموذج للكود لاحقًا بإضافة أسطر ضارة (إلا إذا كنت تثق تمامًا بمؤلفي النموذج):
+
+```py
+commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
+model = AutoModelForImageClassification.from_pretrained(
+    "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
+)
+```
+
+لاحظ وجود زرّ لنسخ رقم إصدار بسهولة عند تصفح سجل التزامات مستودع النموذج على منصة Hugging Face.
diff --git a/docs/source/ar/fast_tokenizers.md b/docs/source/ar/fast_tokenizers.md
new file mode 100644
index 00000000000000..539712969e813f
--- /dev/null
+++ b/docs/source/ar/fast_tokenizers.md
@@ -0,0 +1,51 @@
+# استخدام مجزئيات النصوص من 🤗 Tokenizers
+
+يعتمد [`PreTrainedTokenizerFast`] على مكتبة [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). يمكن تحميل المُجزئات اللغوية التي تم الحصول عليها من مكتبة 🤗 Tokenizers ببساطة شديدة في 🤗 Transformers.
+
+قبل الدخول في التفاصيل، دعونا نبدأ أولاً بإنشاء مُجزئ لغوي تجريبي في بضعة أسطر:
+
+```python
+>>> from tokenizers import Tokenizer
+>>> from tokenizers.models import BPE
+>>> from tokenizers.trainers import BpeTrainer
+>>> from tokenizers.pre_tokenizers import Whitespace
+
+>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+
+>>> tokenizer.pre_tokenizer = Whitespace()
+>>> files = [...]
+>>> tokenizer.train(files, trainer)
+```
+
+الآن لدينا مُجزىء لغوي مدرب على الملفات التي حددناها. يمكننا إما الاستمرار في استخدامه في وقت التشغيل هذا، أو حفظه في ملف JSON لإعادة استخدامه لاحقًا.
+
+## تحميل مُجزئ النّصوص مُباشرةً
+
+دعونا نرى كيف يمكننا الاستفادة من كائن *المُجزئ* هذا في مكتبة 🤗 Transformers. تتيح فئة [`PreTrainedTokenizerFast`] إنشاء مُجزئ نصوص بسهولة، من خلال قبول كائن *tokenizer* مُهيّأ مُسبقًا كمعامل:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+```
+
+يمكن الآن استخدام هذا الكائن مع جميع الطرق المُشتركة بين مُجزّئي النّصوص لـ 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النّصوص](main_classes/tokenizer) لمزيد من المعلومات.
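+
+على سبيل المثال، مخطط بسيط لاستخدام المُجزئ الناتج لترميز نص (الجملة هنا مجرد مثال توضيحي، والنتيجة تعتمد على الملفات التي دُرّب عليها المُجزئ أعلاه):
+
+```python
+>>> encoding = fast_tokenizer("Welcome to the 🤗 Tokenizers library.")
+>>> encoding["input_ids"]  # قائمة معرفات الرموز الناتجة
+```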
+
+## التحميل من ملف JSON
+
+لتحميل مُجزّئ النص من ملف JSON، دعونا نبدأ أولاً بحفظ مُجزّئ النّصوص:
+
+```python
+>>> tokenizer.save("tokenizer.json")
+```
+
+يمكن تمرير المسار الذي حفظنا به هذا الملف إلى طريقة تهيئة [`PreTrainedTokenizerFast`] باستخدام المُعامل `tokenizer_file`:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+```
+
+يمكن الآن استخدام هذا الكائن مع جميع الطرق التي يشترك فيها مُجزّئو النصوص في 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النص](main_classes/tokenizer) لمزيد من المعلومات.
\ No newline at end of file
diff --git a/docs/source/ar/gguf.md b/docs/source/ar/gguf.md
new file mode 100644
index 00000000000000..cdb20c5640a689
--- /dev/null
+++ b/docs/source/ar/gguf.md
@@ -0,0 +1,89 @@
+# GGUF وتفاعلها مع المحولات
+
+تُستخدم صيغة ملف GGUF لتخزين النماذج للاستدلال باستخدام [GGML](https://github.com/ggerganov/ggml) والمكتبات الأخرى التي تعتمد عليه، مثل [llama.cpp](https://github.com/ggerganov/llama.cpp) أو [whisper.cpp](https://github.com/ggerganov/whisper.cpp) الشهيرة جدًا.
+
+إنها صيغة ملف [مدعومة من قبل Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf) مع ميزات تسمح بالفحص السريع للموترات والبيانات الوصفية داخل الملف.
+
+تم تصميم تنسيق الملف هذا كـ "تنسيق ملف واحد" حيث يحتوي ملف واحد عادةً على كل من سمات التكوين ومفردات المجزىء اللغوي والخصائص الأخرى، بالإضافة إلى جميع الموترات التي سيتم تحميلها في النموذج. تأتي هذه الملفات بتنسيقات مختلفة وفقًا لنوع التكميم في الملف. نلقي نظرة موجزة على بعضها [هنا](https://huggingface.co/docs/hub/en/gguf#quantization-types).
+
+## الدعم داخل المحولات
+
+أضفنا القدرة على تحميل ملفات `gguf` داخل `المحولات` لتوفير قدرات تدريب/ضبط إضافية لنماذج gguf، قبل إعادة تحويل تلك النماذج إلى `gguf` لاستخدامها داخل نظام `ggml`. عند تحميل نموذج، نقوم أولاً بإلغاء تكميمه إلى fp32، قبل تحميل الأوزان لاستخدامها في PyTorch.
+
+> [!NOTE]
+> لا يزال الدعم تجريبيًا للغاية ونرحب بالمساهمات من أجل ترسيخه عبر أنواع التكميم وبنى النماذج.
+
+فيما يلي، بنيات النماذج وأنواع التكميم المدعومة:
+
+### أنواع التكميم المدعومة
+
+تُحدد أنواع التكميم المدعومة مبدئيًا وفقًا لملفات التكميم الشائعة التي تمت مشاركتها على Hub.
+
+- F32
+- F16
+- BF16
+- Q4_0
+- Q4_1
+- Q5_0
+- Q5_1
+- Q8_0
+- Q2_K
+- Q3_K
+- Q4_K
+- Q5_K
+- Q6_K
+- IQ1_S
+- IQ1_M
+- IQ2_XXS
+- IQ2_XS
+- IQ2_S
+- IQ3_XXS
+- IQ3_S
+- IQ4_XS
+- IQ4_NL
+
+> [!NOTE]
+> لدعم إلغاء تكميم gguf، يلزم تثبيت `gguf>=0.10.0`.
+
+### بنيات النماذج المدعومة
+
+في الوقت الحالي، بنيات النماذج المدعومة هي البنيات التي كانت شائعة جدًا على Hub، وهي:
+
+- LLaMa
+- Mistral
+- Qwen2
+- Qwen2Moe
+- Phi3
+- Bloom
+- Falcon
+- StableLM
+- GPT2
+- Starcoder2
+- T5
+
+## مثال الاستخدام
+
+لتحميل ملفات `gguf` في `transformers`، يجب تحديد معامل `gguf_file` في دالة `from_pretrained` لكل من المُجزّئ اللغوي والنموذج. فيما يلي كيفية تحميل مُجزّئ لغوي ونموذج يمكن تحميلهما من الملف نفسه:
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
+model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
+```
+
+الآن لديك إمكانية الوصول إلى النسخة الكاملة غير المُكمَّمة للنموذج في بيئة PyTorch، حيث يمكنك دمجها مع مجموعة كبيرة من الأدوات الأخرى.
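+
+على سبيل المثال، مخطط بسيط يُكمل المقطع السابق لتوليد نص بالنموذج غير المُكمَّم (نص المطالبة هنا مجرد افتراض للتوضيح):
+
+```py
+inputs = tokenizer("The capital of France is", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```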
+
+لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp.
+
+فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`:
+
+```py
+tokenizer.save_pretrained('directory')
+model.save_pretrained('directory')
+
+!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory}
+```
diff --git a/docs/source/ar/model_sharing.md b/docs/source/ar/model_sharing.md
index 620261a0c58a3b..b802eb3ef038f0 100644
--- a/docs/source/ar/model_sharing.md
+++ b/docs/source/ar/model_sharing.md
@@ -28,7 +28,7 @@ picture-in-picture" allowfullscreen>
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام
+... "julien-c/EsperBERTo-small", revision="4c77982" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام
... )
```
diff --git a/docs/source/ar/multilingual.md b/docs/source/ar/multilingual.md
new file mode 100644
index 00000000000000..b4b2a94fd40a33
--- /dev/null
+++ b/docs/source/ar/multilingual.md
@@ -0,0 +1,160 @@
+# النماذج متعددة اللغات للاستدلال
+
+هناك العديد من النماذج متعددة اللغات في مكتبة 🤗 Transformers، وتختلف طريقة استخدامها للاستدلال عن النماذج أحادية اللغة. ولكن ليس استخدام جميع النماذج متعددة اللغات مختلفًا؛ فبعض النماذج، مثل [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)، يمكن استخدامها تمامًا مثل النموذج أحادي اللغة. سيوضح لك هذا الدليل كيفية استخدام النماذج متعددة اللغات التي تختلف طريقة استخدامها للاستدلال.
+
+## XLM
+
+يحتوي XLM على عشر نسخ مختلفة، واحدة منها فقط أحادية اللغة. ويمكن تقسيم نسخ النماذج التسع المتبقية إلى فئتين: النسخ التي تستخدم تضمينات اللغة (language embeddings) وتلك التي لا تستخدمها.
+
+### XLM مع تضمينات اللغة
+
+تستخدم النماذج التالية من XLM تضمينات اللغة لتحديد اللغة المستخدمة أثناء الاستدلال:
+
+- `FacebookAI/xlm-mlm-ende-1024` (نمذجة اللغة المقنعة، الإنجليزية-الألمانية)
+- `FacebookAI/xlm-mlm-enfr-1024` (نمذجة اللغة المقنعة، الإنجليزية-الفرنسية)
+- `FacebookAI/xlm-mlm-enro-1024` (نمذجة اللغة المقنعة، الإنجليزية-الرومانية)
+- `FacebookAI/xlm-mlm-xnli15-1024` (نمذجة اللغة المقنعة، لغات XNLI)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (نمذجة اللغة المقنعة + الترجمة، لغات XNLI)
+- `FacebookAI/xlm-clm-enfr-1024` (نمذجة اللغة السببية، الإنجليزية-الفرنسية)
+- `FacebookAI/xlm-clm-ende-1024` (نمذجة اللغة السببية، الإنجليزية-الألمانية)
+
+تُمثل تضمينات اللغة على شكل مصفوفة بنفس شكل `input_ids` التي يتم تمريرها إلى النموذج. وتعتمد القيم في هذه المصفوفات على اللغة المستخدمة ويتم تحديدها بواسطة معاملَي المُجزئ `lang2id` و `id2lang`.
+
+في هذا المثال، قم بتحميل نسخة `FacebookAI/xlm-clm-enfr-1024` ( نمذجة اللغة السببية، الإنجليزية-الفرنسية):
+
+```py
+>>> import torch
+>>> from transformers import XLMTokenizer, XLMWithLMHeadModel
+
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+```
+
+تُظهر خاصية `lang2id` في المجزىء اللغات وأرقام تعريفها في هذا النموذج:
+
+```py
+>>> print(tokenizer.lang2id)
+{'en': 0, 'fr': 1}
+```
+
+بعد ذلك، قم بإنشاء مثال على المدخلات:
+
+```py
+>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+```
+
+قم بتعيين معرف اللغة إلى `"en"` واستخدمه لتحديد تضمين اللغة. وتضمين اللغة عبارة عن مصفوفة مملوءة بـ `0` لأن هذا هو معرف اللغة الإنجليزية. يجب أن تكون هذه المصفوفة بنفس حجم `input_ids`.
+
+```py
+>>> language_id = tokenizer.lang2id["en"] # 0
+>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
+
+>>> # نقوم بإعادة تشكيلها لتكون بالحجم (batch_size، sequence_length)
+>>> langs = langs.view(1, -1) # الآن بالحجم [1، sequence_length] (لدينا batch size تساوي 1)
+```
+
+الآن يمكنك تمرير `input_ids` وتضمين اللغة إلى النموذج:
+
+```py
+>>> outputs = model(input_ids, langs=langs)
+```
+
+يمكن للبرنامج النصي [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) توليد النص باستخدام تضمينات اللغة مع نقاط تفتيش `xlm-clm`.
+
+### XLM بدون تضمينات اللغة
+
+النماذج التالية من XLM لا تتطلب تضمينات اللغة أثناء الاستنتاج:
+
+- `FacebookAI/xlm-mlm-17-1280` (نمذجة اللغة المقنعة، 17 لغة)
+- `FacebookAI/xlm-mlm-100-1280` (نمذجة اللغة المقنعة، 100 لغة)
+
+تُستخدم هذه النماذج لتمثيل الجمل العامة، على عكس نسخ XLM السابقة.
+
+## BERT
+
+يمكن استخدام النماذج التالية من BERT للمهام متعددة اللغات:
+
+- `google-bert/bert-base-multilingual-uncased` (نمذجة اللغة المقنعة + التنبؤ بالجملة التالية، 102 لغة)
+- `google-bert/bert-base-multilingual-cased` (نمذجة اللغة المقنعة + التنبؤ بالجملة التالية، 104 لغات)
+
+لا تتطلب هذه النماذج تضمينات اللغة أثناء الاستدلال، إذ يجب أن تحدد اللغة من السياق وتستنتج وفقًا لذلك.
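+
+على سبيل المثال، مخطط بسيط يوضح الاستخدام المباشر دون أي تضمينات لغة (الجملة هنا مجرد مثال توضيحي):
+
+```py
+>>> from transformers import pipeline
+
+>>> fill_mask = pipeline("fill-mask", model="google-bert/bert-base-multilingual-cased")
+>>> fill_mask("Paris est la [MASK] de la France.")
+```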
+
+## XLM-RoBERTa
+
+يمكن استخدام النماذج التالية من XLM-RoBERTa للمهام متعددة اللغات:
+
+- `FacebookAI/xlm-roberta-base` (نمذجة اللغة المقنعة، 100 لغة)
+- `FacebookAI/xlm-roberta-large` (نمذجة اللغة المقنعة، 100 لغة)
+
+تم تدريب XLM-RoBERTa على 2.5 تيرابايت من بيانات CommonCrawl الجديدة والمحسنة في 100 لغة. ويوفر مكاسب قوية على النماذج متعددة اللغات التي تم إصدارها سابقًا مثل mBERT أو XLM في المهام النهائية (downstream) مثل التصنيف، ووسم التسلسلات، والإجابة على الأسئلة.
+
+## M2M100
+
+يمكن استخدام النماذج التالية من M2M100 للترجمة متعددة اللغات:
+
+- `facebook/m2m100_418M` (الترجمة)
+- `facebook/m2m100_1.2B` (الترجمة)
+
+في هذا المثال، قم بتحميل نسخة `facebook/m2m100_418M` لترجمة النص من الصينية إلى الإنجليزية. يمكنك تعيين اللغة المصدر في المُجزئ اللغوي:
+
+```py
+>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+
+>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
+>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒."
+
+>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh")
+>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+```
+
+تقسيم النّص إلى رموز:
+
+```py
+>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt")
+```
+
+يفرض M2M100 معرف اللغة الهدف كأول رمز مُولّد للترجمة إلى اللغة الهدف. قم بتعيين `forced_bos_token_id` إلى `en` في طريقة `generate` للترجمة إلى الإنجليزية:
+
+```py
+>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
+>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.'
+```
+
+## MBart
+
+يمكن استخدام النماذج التالية من MBart للترجمة متعددة اللغات:
+
+- `facebook/mbart-large-50-one-to-many-mmt` (الترجمة الآلية متعددة اللغات من واحد إلى كثير، 50 لغة)
+- `facebook/mbart-large-50-many-to-many-mmt` (الترجمة الآلية متعددة اللغات من كثير إلى كثير، 50 لغة)
+- `facebook/mbart-large-50-many-to-one-mmt` (الترجمة الآلية متعددة اللغات من كثير إلى واحد، 50 لغة)
+- `facebook/mbart-large-50` (الترجمة متعددة اللغات، 50 لغة)
+- `facebook/mbart-large-cc25`
+
+في هذا المثال، قم بتحميل نسخة `facebook/mbart-large-50-many-to-many-mmt` لترجمة النص من الفنلندية إلى الإنجليزية. يمكنك تعيين اللغة المصدر في المجزىء:
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
+>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia."
+
+>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+```
+
+تقسيم النّص إلى رموز:
+
+```py
+>>> encoded_en = tokenizer(en_text, return_tensors="pt")
+```
+
+يفرض MBart معرف اللغة الهدف كأول رمز مُولّد للترجمة إلى اللغة الهدف. قم بتعيين `forced_bos_token_id` إلى `en` في طريقة `generate` للترجمة إلى الإنجليزية:
+
+```py
+>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
+>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
+```
+
+إذا كنت تستخدم نسخة `facebook/mbart-large-50-many-to-one-mmt`، فلا تحتاج إلى إجبار معرف لغة الهدف كأول رمز مولد، وإلا فإن الاستخدام هو نفسه.
\ No newline at end of file
diff --git a/docs/source/ar/sagemaker.md b/docs/source/ar/sagemaker.md
new file mode 100644
index 00000000000000..6bb53816baaaee
--- /dev/null
+++ b/docs/source/ar/sagemaker.md
@@ -0,0 +1,8 @@
+# تشغيل التدريب على Amazon SageMaker
+
+تم نقل التوثيق إلى [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). وسيتم إزالة هذه الصفحة في الإصدار 5.0 من برنامج Transformers.
+
+### جدول المحتويات
+
+- [تدريب نماذج Hugging Face على Amazon SageMaker باستخدام SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train)
+- [نشر نماذج Hugging Face على Amazon SageMaker باستخدام SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference)
\ No newline at end of file
diff --git a/docs/source/ar/serialization.md b/docs/source/ar/serialization.md
new file mode 100644
index 00000000000000..2df620d86239a0
--- /dev/null
+++ b/docs/source/ar/serialization.md
@@ -0,0 +1,170 @@
+# التصدير إلى ONNX
+
+غالبًا ما يتطلب نشر نماذج 🤗 Transformers في بيئات الإنتاج تصدير النماذج إلى تنسيق تسلسلي يمكن تحميله وتنفيذه على أجهزة وبرامج تشغيل متخصصة، أو قد يستفيد من ذلك.
+
+🤗 Optimum هو امتداد لـ Transformers يمكّن من تصدير النماذج من PyTorch أو TensorFlow إلى تنسيقات مُتسلسلة مثل ONNX و TFLite من خلال وحدة `exporters` الخاصة به. يوفر 🤗 Optimum أيضًا مجموعة من أدوات تحسين الأداء لتدريب النماذج وتشغيلها على أجهزة مستهدفة بكفاءة قصوى.
+
+يوضح هذا الدليل كيفية تصدير نماذج 🤗 Transformers إلى ONNX باستخدام 🤗 Optimum، وللحصول على الدليل الخاص بتصدير النماذج إلى TFLite، يُرجى الرجوع إلى صفحة [التصدير إلى TFLite](tflite).
+
+## التصدير إلى ONNX
+
+[ONNX (Open Neural Network Exchange)](http://onnx.ai) هو معيار مفتوح يُحدد مجموعة مشتركة من المشغلات (operators) وتنسيق ملف مشترك لتمثيل نماذج التعلم العميق في مجموعة متنوعة واسعة من الأطر، بما في ذلك PyTorch وTensorFlow. عندما يتم تصدير نموذج إلى تنسيق ONNX، تُستخدم هذه المشغلات لبناء رسم بياني حاسوبي (يُطلق عليه غالبًا اسم _تمثيل وسيط_) يمثل تدفق البيانات عبر الشبكة العصبية.
+
+من خلال عرض رسم بياني بعوامل وأنواع بيانات معيارية، يُسهّل ONNX التبديل بين الأطر. على سبيل المثال، يُمكن تصدير نموذج مدرب في PyTorch إلى تنسيق ONNX ثم استيراده في TensorFlow (والعكس صحيح).
+
+بمجرد التصدير إلى تنسيق ONNX، يُمكن:
+
+- تحسين النموذج للاستدلال عبر تقنيات مثل [تحسين الرسم البياني](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) و [التكميم](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization).
+- تشغيله باستخدام ONNX Runtime عبر فئات [`ORTModelForXXX`](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort)، والتي تتبع نفس واجهة برمجة التطبيقات (API) لـ `AutoModel` التي اعتدت عليها في 🤗 Transformers.
+- تشغيله باستخدام [قنوات معالجة الاستدلال مُحسّنة](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines)، والتي لها نفس واجهة برمجة التطبيقات (API) مثل وظيفة [`pipeline`] في 🤗 Transformers.
+
+يوفر 🤗 Optimum دعمًا لتصدير ONNX من خلال الاستفادة من كائنات التكوين. تأتي كائنات التكوين هذه جاهزة لعدد من معماريات النماذج، وقد تم تصميمها لتكون قابلة للتوسعة بسهولة إلى معماريات أخرى.
+
+للاطلاع على قائمة بالتكوينات الجاهزة، يُرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/overview).
+
+هناك طريقتان لتصدير نموذج 🤗 Transformers إلى ONNX، نعرض هنا كليهما:
+
+- التصدير باستخدام 🤗 Optimum عبر واجهة سطر الأوامر (CLI).
+- التصدير باستخدام 🤗 Optimum مع `optimum.onnxruntime`.
+
+### تصدير نموذج 🤗 Transformers إلى ONNX باستخدام واجهة سطر الأوامر
+
+لتصدير نموذج 🤗 Transformers إلى ONNX، قم أولاً بتثبيت اعتماد إضافي:
+
+```bash
+pip install optimum[exporters]
+```
+
+للاطلاع على جميع المعاملات المتاحة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)، أو عرض المساعدة في سطر الأوامر:
+
+```bash
+optimum-cli export onnx --help
+```
+
+لتصدير نقطة تفتيش نموذج من 🤗 Hub، على سبيل المثال، `distilbert/distilbert-base-uncased-distilled-squad`، قم بتشغيل الأمر التالي:
+
+```bash
+optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+```
+
+يجب أن تشاهد السجلات التي تشير إلى التقدم المحرز وتظهر المكان الذي تم فيه حفظ ملف `model.onnx` الناتج، مثل هذا:
+
+```bash
+Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx...
+ -[✓] ONNX model output names match reference model (start_logits, end_logits)
+ - Validating ONNX Model output "start_logits":
+ -[✓] (2, 16) matches (2, 16)
+ -[✓] all values close (atol: 0.0001)
+ - Validating ONNX Model output "end_logits":
+ -[✓] (2, 16) matches (2, 16)
+ -[✓] all values close (atol: 0.0001)
+The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx
+```
+
+يوضح المثال أعلاه تصدير نقطة تفتيش من 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج ومحول الرموز في نفس الدليل (`local_path`). عند استخدام واجهة سطر الأوامر، قم بتمرير `local_path` إلى وسيط `model` بدلاً من اسم نقطة التفتيش على 🤗 Hub وقدم وسيط `--task`. يمكنك مراجعة قائمة المهام المدعومة في [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/task_manager). إذا لم يتم توفير وسيط `task`، فسيتم تعيينه افتراضيًا إلى هندسة النموذج دون أي رأس محدد للمهمة.
+
+```bash
+optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/
+```
+
+يمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد [المسرعات](https://onnx.ai/supported-tools.html#deployModel) العديدة التي تدعم معيار ONNX. على سبيل المثال، يمكننا تحميل النموذج وتشغيله باستخدام [ONNX Runtime](https://onnxruntime.ai/) كما يلي:
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from optimum.onnxruntime import ORTModelForQuestionAnswering
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
+>>> model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
+>>> inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+تكون العملية مماثلة بالنسبة إلى نقاط تفتيش TensorFlow على Hub. على سبيل المثال، إليك كيفية تصدير نقطة تفتيش TensorFlow نقية من [منظمة Keras](https://huggingface.co/keras-io):
+
+```bash
+optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/
+```
+
+### تصدير نموذج 🤗 Transformers إلى ONNX باستخدام `optimum.onnxruntime`
+
+كبديل لواجهة سطر الأوامر، يُمكنك تصدير نموذج 🤗 Transformers إلى ONNX برمجيًا كما يلي:
+
+```python
+>>> from optimum.onnxruntime import ORTModelForSequenceClassification
+>>> from transformers import AutoTokenizer
+
+>>> model_checkpoint = "distilbert_base_uncased_squad"
+>>> save_directory = "onnx/"
+
+>>> # تحميل نموذج من transformers وتصديره إلى ONNX
+>>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+>>> # حفظ نموذج onnx ومجزىء النصوص
+>>> ort_model.save_pretrained(save_directory)
+>>> tokenizer.save_pretrained(save_directory)
+```
+
+### تصدير نموذج لهندسة غير مدعومة
+
+إذا كنت ترغب في المساهمة من خلال إضافة دعم لنموذج لا يُمكن تصديره حاليًا، فيجب عليك أولاً التحقق مما إذا كان مدعومًا في [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)، وإذا لم يكن مدعومًا، [فيمكنك المساهمة في 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) مُباشرةً.
+
+### تصدير نموذج باستخدام `transformers.onnx`
+
+
+
+لم يعد `transformers.onnx` مدعومًا؛ يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.
+
+
+
+لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية:
+
+```bash
+pip install transformers[onnx]
+```
+
+استخدم حزمة `transformers.onnx` كوحدة Python لتصدير نقطة حفظ باستخدام تكوين جاهز:
+
+```bash
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
+```
+
+يُصدّر هذا رسمًا بيانيًا ONNX لنقطة الحفظ المُحددة بواسطة وسيطة `--model`. مرر أي نقطة حفظ على 🤗 Hub أو نقطة حفظ مُخزنة محليًا.
+يُمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد المُسرعات العديدة التي تدعم معيار ONNX. على سبيل المثال، قم بتحميل وتشغيل النموذج باستخدام ONNX Runtime كما يلي:
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # يتوقع ONNX Runtime مصفوفات NumPy كمدخلات
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+يُمكن الحصول على أسماء المخرجات المطلوبة (مثل `["last_hidden_state"]`) من خلال إلقاء نظرة على تكوين ONNX لكل نموذج. على سبيل المثال، بالنسبة لـ DistilBERT، لدينا:
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+العمليات مُتطابقة لنقاط الحفظ TensorFlow على Hub. على سبيل المثال، صدّر نقطة حفظ TensorFlow خالصة كما يلي:
+
+```bash
+python -m transformers.onnx --model=keras-io/transformers-qa onnx/
+```
+
+لتصدير نموذج مُخزن محليًا، احفظ أوزان النموذج والمُجزئ اللغوي في نفس الدليل (على سبيل المثال `local-pt-checkpoint`)، ثم قم بتصديره إلى ONNX عن طريق توجيه وسيط `--model` لحزمة `transformers.onnx` إلى الدليل المطلوب:
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
\ No newline at end of file
diff --git a/docs/source/ar/tflite.md b/docs/source/ar/tflite.md
new file mode 100644
index 00000000000000..5e75c7a10a3c66
--- /dev/null
+++ b/docs/source/ar/tflite.md
@@ -0,0 +1,40 @@
+# التصدير إلى TFLite
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/guide) هو إطار عمل خفيف الوزن لنشر نماذج التعلم الآلي على الأجهزة المحدودة الموارد، مثل الهواتف المحمولة، والأنظمة المدمجة، وأجهزة إنترنت الأشياء (IoT). تم تصميم TFLite لتشغيل النماذج وتحسينها بكفاءة على هذه الأجهزة ذات الطاقة الحاسوبية والذاكرة واستهلاك الطاقة المحدودة.
+
+يُمثَّل نموذج TensorFlow Lite بتنسيق محمول فعال خاص يُعرَّف بامتداد الملف `.tflite`.
+
+🤗 Optimum يقدم وظيفة لتصدير نماذج 🤗 Transformers إلى TFLite من خلال الوحدة النمطية `exporters.tflite`. بالنسبة لقائمة هندسات النماذج المدعومة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/tflite/overview).
+
+لتصدير نموذج إلى TFLite، قم بتثبيت متطلبات البرنامج المطلوبة:
+
+```bash
+pip install optimum[exporters-tf]
+```
+
+للاطلاع على جميع المعاملات المتاحة، راجع [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)، أو اعرض المساعدة في سطر الأوامر:
+
+```bash
+optimum-cli export tflite --help
+```
+
+لتصدير نقطة حفظ نموذج من 🤗 Hub، على سبيل المثال، `google-bert/bert-base-uncased`، قم بتشغيل الأمر التالي:
+
+```bash
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
+```
+
+ستظهر لك السجلات التي تُبيّن التقدم وموقع حفظ ملف `model.tflite` الناتج، كما في المثال التالي:
+
+```bash
+Validating TFLite model...
+ -[✓] TFLite model output names match reference model (logits)
+ - Validating TFLite Model output "logits":
+ -[✓] (1, 128, 30522) matches (1, 128, 30522)
+ -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
+The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
+- logits: max diff = 5.817413330078125e-05.
+ The exported model was saved at: bert_tflite
+```
+
+يُبيّن المثال أعلاه كيفية تصدير نقطة حفظ من 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج والمُجزئ اللغوي في نفس المسار (`local_path`). عند استخدام واجهة سطر الأوامر (CLI)، قم بتمرير `local_path` إلى معامل `model` بدلاً من اسم النسخة على 🤗 Hub.
\ No newline at end of file
diff --git a/docs/source/ar/torchscript.md b/docs/source/ar/torchscript.md
new file mode 100644
index 00000000000000..bf0bc0dde04b62
--- /dev/null
+++ b/docs/source/ar/torchscript.md
@@ -0,0 +1,154 @@
+# التصدير إلى TorchScript
+
+
+
+هذه هي بداية تجاربنا مع TorchScript ولا زلنا نستكشف قدراته مع نماذج المدخلات المتغيرة الحجم. إنه مجال اهتمامنا وسنعمق تحليلنا في الإصدارات القادمة، مع المزيد من الأمثلة البرمجية، وتنفيذ أكثر مرونة، ومقاييس مقارنة بين الأكواد القائمة على Python مع أكواد TorchScript المُجمّعة.
+
+
+
+وفقًا لـ [وثائق TorchScript](https://pytorch.org/docs/stable/jit.html):
+
+> TorchScript هي طريقة لإنشاء نماذج قابلة للتسلسل والتحسين من تعليمات PyTorch البرمجية.
+
+هناك وحدتان من PyTorch، [JIT and TRACE](https://pytorch.org/docs/stable/jit.html)، تتيحان للمطورين تصدير نماذجهم لإعادة استخدامها في برامج أخرى مثل برامج C++ المُحسّنة للأداء.
+
+نقدم واجهة تتيح لك تصدير نماذج 🤗 Transformers إلى TorchScript بحيث يمكن إعادة استخدامها في بيئة مختلفة عن برامج Python القائمة على PyTorch. هنا نشرح كيفية تصدير نماذجنا واستخدامها باستخدام TorchScript.
+
+يتطلب تصدير نموذج أمرين:
+
+- تهيئة مثيل للنموذج باستخدام علامة `torchscript`
+- تمرير مُدخلات وهمية (dummy inputs) خلال النموذج
+
+يستلزم كلا الأمرين عدة أشياء يجب على المطورين توخي الحذر بشأنها، كما هو مفصل أدناه.
+
+## علامة TorchScript والأوزان المرتبطة
+
+علامة `torchscript` ضرورية لأن معظم نماذج اللغة 🤗 Transformers لها أوزان مرتبطة بين طبقة `Embedding` وطبقة `Decoding`. لا يسمح لك TorchScript بتصدير النماذج ذات الأوزان المرتبطة، لذلك من الضروري فصل الأوزان ونسخها مسبقًا.
+
+النماذج المُهيأة باستخدام علامة `torchscript` لها طبقة `Embedding` وطبقة `Decoding` منفصلتان، مما يعني أنه لا ينبغي تدريبها لاحقًا. سيؤدي التدريب إلى عدم تزامن الطبقتين، مما يؤدي إلى نتائج غير متوقعة.
+
+هذا لا ينطبق على النماذج التي لا تحتوي على رأس نموذج اللغة، حيث لا تملك أوزانًا مرتبطة. يمكن تصدير هذه النماذج بأمان دون علامة `torchscript`.
+
+## المدخلات الوهمية والأطوال القياسية
+
+تُستخدم المُدخلات الوهمية لتمرير أمامي خلال النموذج. أثناء انتشار قيم المُدخلات عبر الطبقات، يتتبع PyTorch العمليات المختلفة التي يتم تنفيذها على كل مصفوفة(tensor). ثم يتم استخدام هذه العمليات المُسجلة بعد ذلك لإنشاء *أثر* النموذج.
+
+يتم إنشاء التتبع بالنسبة لأبعاد المُدخلات. وبالتالي، فهو مُقيّد بأبعاد المُدخلات الوهمية، ولن يعمل لأي طول تسلسل أو حجم دفعة مختلف. عند المحاولة بحجم مختلف، يتم رفع الخطأ التالي:
+
+```
+`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
+```
+
+نوصي بتتبع النموذج باستخدام حجم مُدخلات وهمية لا يقل عن أكبر مُدخل سيتم تقديمه للنموذج أثناء الاستدلال. يمكن أن تساعد الحشوة (padding) في ملء القيم المفقودة. ومع ذلك، نظرًا لتتبع النموذج بحجم مُدخل أكبر، ستكون أبعاد المصفوفة كبيرة أيضًا، مما ينتج عنه المزيد من الحسابات.
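+
+على سبيل المثال، مخطط بسيط (نقطة الحفظ وطول التسلسل هنا مجرد افتراضات للتوضيح) لإنشاء مُدخل وهمي مُبطَّن إلى طول ثابت قبل التتبع:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+# ابطّن المُدخل الوهمي إلى أكبر طول تسلسل متوقع أثناء الاستدلال
+dummy = tokenizer(
+    "Jim Henson was a puppeteer",
+    padding="max_length",
+    max_length=32,
+    return_tensors="pt",
+)
+```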
+
+انتبه إلى إجمالي عدد العمليات المُنفذة على كل مُدخل وتابع الأداء عن كثب عند تصدير نماذج متغيرة طول التسلسل.
+
+## استخدام TorchScript في Python
+
+يوضح هذا القسم كيفية حفظ النماذج وتحميلها، بالإضافة إلى كيفية استخدام التتبع للاستدلال.
+
+### حفظ نموذج
+
+لتصدير `BertModel` باستخدام TorchScript، قم بتهيئة `BertModel` باستخدام فئة `BertConfig` ثم احفظه على القرص تحت اسم الملف `traced_bert.pt`:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+# Tokenizing input text
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = enc.tokenize(text)
+
+# Masking one of the input tokens
+masked_index = 8
+tokenized_text[masked_index] = "[MASK]"
+indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Creating a dummy input
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+dummy_input = [tokens_tensor, segments_tensors]
+
+# Initializing the model with the torchscript flag
+# Flag set to True even though it is not necessary as this model does not have an LM Head.
+config = BertConfig(
+ vocab_size_or_config_json_file=32000,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ torchscript=True,
+)
+
+# Instantiating the model
+model = BertModel(config)
+
+# The model needs to be in evaluation mode
+model.eval()
+
+# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
+
+# Creating the trace
+traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+torch.jit.save(traced_model, "traced_bert.pt")
+```
+
+### تحميل نموذج
+
+يمكنك الآن تحميل `BertModel` المُحفظ سابقًا، `traced_bert.pt`، من القرص واستخدامه على `dummy_input` المُهيأ سابقًا:
+
+```python
+loaded_model = torch.jit.load("traced_bert.pt")
+loaded_model.eval()
+
+all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+```
+
+### استخدام نموذج مُتتبع للاستدلال
+
+استخدم النموذج المُتتبع للاستدلال باستخدام أسلوب `__call__` الخاص به:
+
+```python
+traced_model(tokens_tensor, segments_tensors)
+```
+
+## نشر نماذج Hugging Face TorchScript على AWS باستخدام Neuron SDK
+
+قدمت AWS عائلة [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) من الأجهزة للاستدلال منخفض التكلفة وعالي الأداء في التعلم الآلي على السحابة. تعمل أجهزة Inf1 بواسطة شريحة Inferentia من AWS، وهي مُسرّع أجهزة مُخصص، متخصص في أعباء عمل الاستدلال للتعلم العميق. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) هي SDK لـ Inferentia تدعم تتبع نماذج المحولات وتحسينها للنشر على Inf1. توفر Neuron SDK ما يلي:
+
+1. واجهة برمجة تطبيقات سهلة الاستخدام مع تغيير سطر واحد من التعليمات البرمجية لتتبع نموذج TorchScript وتحسينه للاستدلال في البيئة السحابية.
+2. تحسينات أداء جاهزة للاستخدام من أجل [تحسين التكلفة والأداء](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/).
+3. دعم نماذج محولات Hugging Face المبنية باستخدام إما [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) أو [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
+
+### الآثار المترتبة
+
+تعمل نماذج المحولات المستندة إلى بنية [BERT (تمثيلات الترميز ثنائية الاتجاه من المحولات)](https://huggingface.co/docs/transformers/main/model_doc/bert) أو متغيراتها مثل [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) و [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) بشكل أفضل على Inf1 للمهام غير التوليدية مثل الإجابة على الأسئلة الاستخراجية، وتصنيف التسلسلات، وتصنيف الرموز (tokens). ومع ذلك، يمكن تكييف مهام توليد النصوص للعمل على Inf1 وفقًا لهذا [برنامج تعليمي AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). يمكن العثور على مزيد من المعلومات حول النماذج التي يمكن تحويلها جاهزة على Inferentia في قسم [ملاءمة بنية النموذج](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) من وثائق Neuron.
+
+### التبعيات (Dependencies)
+
+يتطلب استخدام AWS Neuron لتحويل النماذج [بيئة SDK Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) والتي تأتي مسبقًا على [AMI للتعلم العميق من AWS](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
+
+### تحويل نموذج لـ AWS Neuron
+
+قم بتحويل نموذج لـ AWS Neuron باستخدام نفس التعليمات البرمجية من [استخدام TorchScript في Python](torchscript#using-torchscript-in-python) لتتبع `BertModel`. قم باستيراد امتداد إطار عمل `torch.neuron` للوصول إلى مكونات Neuron SDK من خلال واجهة برمجة تطبيقات Python:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+import torch.neuron
+```
+
+كل ما عليك فعله هو تعديل السطر التالي:
+
+```diff
+- torch.jit.trace(model, [tokens_tensor, segments_tensors])
++ torch.neuron.trace(model, [tokens_tensor, segments_tensors])
+```
+
+يتيح ذلك لـ Neuron SDK تتبع النموذج وتحسينه لمثيلات Inf1.
+
+لمعرفة المزيد حول ميزات AWS Neuron SDK والأدوات ودروس البرامج التعليمية والتحديثات الأخيرة، يرجى الاطلاع على [وثائق AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
diff --git a/docs/source/ar/trainer.md b/docs/source/ar/trainer.md
new file mode 100644
index 00000000000000..7da7cbf4e1714b
--- /dev/null
+++ b/docs/source/ar/trainer.md
@@ -0,0 +1,720 @@
+# Trainer
+
+تُتيح وحدة [`Trainer`] حلقة تدريب وتقييم متكاملة لنماذج PyTorch المطبقة في مكتبة Transformers. تحتاج فقط إلى تمرير المكونات الضرورية للتدريب (النموذج، والمُجزئ النصي، ومجموعة البيانات، ودالة التقييم، والمعلمات الفائقة للتدريب، إلخ)، وستتولى فئة [`Trainer`] الباقي. هذا يُسهّل بدء التدريب بشكل أسرع دون كتابة حلقة التدريب الخاصة بك يدويًا. وفي الوقت نفسه، فإن [`Trainer`] قابل للتخصيص بدرجة كبيرة ويوفر العديد من خيارات التدريب حتى تتمكن من تخصيصه وفقًا لاحتياجات التدريب الخاصة بك بدقة.
+
+
+
+بالإضافة إلى فئة [`Trainer`]، توفر مكتبة Transformers أيضًا فئة [`Seq2SeqTrainer`] للمهام التسلسلية مثل الترجمة أو التلخيص. هناك أيضًا فئة [`~trl.SFTTrainer`] من مكتبة [TRL](https://hf.co/docs/trl) التي تغلّف فئة [`Trainer`] وهي مُحسَّنة لتدريب نماذج اللغة مثل Llama-2 وMistral باستخدام تقنيات التوليد الذاتي (autoregressive). كما يدعم [`~trl.SFTTrainer`] ميزات مثل حزم التسلسلات، وLoRA، والتكميم، وDeepSpeed، مما يُمكّن من التدريب بكفاءة على نماذج ضخمة الحجم.
+
+
+
+لا تتردد في الاطلاع على [مرجع API](./main_classes/trainer) لهذه الفئات الأخرى من نوع [`Trainer`] لمعرفة المزيد حول متى يتم استخدام كل منها. بشكل عام، [`Trainer`] هو الخيار الأكثر تنوعًا وهو مناسب لمجموعة واسعة من المهام. صُمم [`Seq2SeqTrainer`] للمهام التسلسلية، بينما صُمم [`~trl.SFTTrainer`] لتدريب نماذج اللغة الكبيرة.
+
+
+
+قبل البدء، تأكد من تثبيت مكتبة [Accelerate](https://hf.co/docs/accelerate) - وهي مكتبة تُمكّن تشغيل تدريب PyTorch في بيئات مُوزعة.
+
+```bash
+pip install accelerate
+
+# upgrade
+pip install accelerate --upgrade
+```
+
+يوفر هذا الدليل نظرة عامة على فئة [`Trainer`].
+
+## الاستخدام الأساسي
+
+يتضمن [`Trainer`] جميع التعليمات البرمجية التي ستجدها في حلقة التدريب الأساسية:
+
+1. قم بتنفيذ خطوة تدريب لحساب الخسارة
+2. احسب المشتقات باستخدام طريقة [`~accelerate.Accelerator.backward`]
+3. تحديث الأوزان بناءً على المشتقات
+4. كرر هذه العملية حتى تصل إلى عدد محدد مسبقًا من الدورات (epochs).
+
+تُجرد فئة [`Trainer`] كل هذه التعليمات البرمجية حتى لا تضطر إلى القلق بشأن كتابة حلقة تدريب يدويًا في كل مرة، أو إذا كنت قد بدأت للتو في استخدام PyTorch والتدريب. كل ما عليك فعله هو توفير المكونات الأساسية اللازمة للتدريب، مثل النموذج ومجموعة البيانات، وتتعامل فئة [`Trainer`] مع كل شيء آخر.
+
+إذا كنت تُريد تحديد أي خيارات تدريب أو معلمات فائقة، فيمكنك العثور عليها في فئة [`TrainingArguments`]. على سبيل المثال، دعنا نحدد أين يتم حفظ النموذج في `output_dir` ورفع النموذج إلى Hub بعد التدريب باستخدام `push_to_hub=True`.
+
+```py
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="your-model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+ load_best_model_at_end=True,
+ push_to_hub=True,
+)
+```
+مرر `training_args` إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، وشيء لمعالجة مجموعة البيانات مسبقًا (حسب نوع البيانات، قد يكون مُجزئًا لغويًا أو مستخرج ميزات أو معالج صور)، وجامع بيانات، ودالة لحساب المقاييس التي تُريد تتبعها أثناء التدريب.
+
+أخيرًا، استدعِ [`~Trainer.train`] لبدء التدريب!
+
+```py
+from transformers import Trainer
+
+trainer = Trainer(
+ model=model,
+ args=training_args,
+    train_dataset=dataset["train"],
+    eval_dataset=dataset["test"],
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics,
+)
+
+trainer.train()
+```
+
+### نقاط الحفظ
+
+تحفظ فئة [`Trainer`] نقاط حفظ النموذج في الدليل المحدد في معامل `output_dir` من [`TrainingArguments`]. ستجد نقاط الحفظ في مجلد فرعي يسمى `checkpoint-000` حيث تتوافق الأرقام في النهاية مع خطوة التدريب. إن حفظ نقاط الحفظ مفيد لاستئناف التدريب لاحقًا.
+
+```py
+# استأنف من أحدث نقطة حفظ
+trainer.train(resume_from_checkpoint=True)
+
+# استأنف من نقطة حفظ محددة محفوظة في دليل الإخراج
+trainer.train(resume_from_checkpoint="your-model/checkpoint-1000")
+```
+
+يمكنك حفظ نقاط الحفظ الخاصة بك (لا يتم حفظ حالة المُجزئ اللغوي تلقائيًا) إلى Hub عن طريق تعيين `push_to_hub=True` في [`TrainingArguments`] لرفعها. الخيارات الأخرى لتحديد كيفية حفظ هذه النقاط تُضبط عبر معامل [`hub_strategy`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.hub_strategy) (انظر المثال بعد القائمة التالية):
+
+* `hub_strategy="checkpoint"` يدفع أحدث نقطة حفظ إلى مجلد فرعي يسمى "last-checkpoint" يمكنك استئناف التدريب منه
+* `hub_strategy="all_checkpoints"` يدفع جميع نقاط الحفظ إلى الدليل المحدد في `output_dir` (سترى نقطة حفظ واحدة لكل مجلد في مستودع النموذج الخاص بك)
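+
+على سبيل المثال، مخطط بسيط لإعداد يرفع جميع نقاط الحفظ إلى Hub (اسم الدليل هنا مجرد افتراض للتوضيح):
+
+```py
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="your-model",
+    push_to_hub=True,
+    hub_strategy="all_checkpoints",
+    save_strategy="epoch",
+)
+```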
+
+عند استئناف التدريب من نقطة حفظ، تُحاول [`Trainer`] الحفاظ على حالات RNG في Python وNumPy وPyTorch كما كانت عندما تم حفظ نقطة الحفظ. ولكن نظرًا لأن PyTorch لديه العديد من الإعدادات الافتراضية غير الحتمية، فإن حالات RNG ليست مضمونة لتكون هي نفسها. إذا كنت تريد تمكين الحتمية الكاملة، فراجع دليل [التحكم في مصادر العشوائية](https://pytorch.org/docs/stable/notes/randomness#controlling-sources-of-randomness) لمعرفة ما يُمكنك تمكينه لجعل تدريبك حتميًا تمامًا. ضع في اعتبارك أنه من خلال جعل إعدادات معينة حتمية، قد يكون التدريب أبطأ.
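+
+على سبيل المثال، مخطط بسيط لتثبيت البذور وتفعيل الخوارزميات الحتمية في PyTorch (قيمة البذرة هنا مجرد افتراض، وقد يؤدي ذلك إلى إبطاء التدريب كما ذُكر أعلاه):
+
+```py
+import torch
+from transformers import set_seed
+
+set_seed(42)  # يثبّت بذور Python وNumPy وPyTorch
+torch.use_deterministic_algorithms(True)  # قد يرفع خطأ إذا لم يتوفر تنفيذ حتمي لعملية ما
+```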
+
+## تخصيص المدرب
+
+في حين أن فئة [`Trainer`] مُصممة لتكون سهلة الوصول وسهلة الاستخدام، فإنها توفر أيضًا الكثير من قابلية التخصيص للمستخدمين المغامرين. يُمكن إنشاء فئات فرعية من العديد من أساليب [`Trainer`] وتجاوزها لدعم الوظائف التي تُريدها، دون الحاجة إلى إعادة كتابة حلقة التدريب بأكملها من البداية لاستيعابها. تتضمن هذه الأساليب:
+
+* [`~Trainer.get_train_dataloader`] creates a training DataLoader
+* [`~Trainer.get_eval_dataloader`] creates an evaluation DataLoader
+* [`~Trainer.get_test_dataloader`] creates a test DataLoader
+* [`~Trainer.log`] logs information about the various objects that watch training
+* [`~Trainer.create_optimizer_and_scheduler`] creates an optimizer and learning rate scheduler if they weren't passed in the `__init__`; these can also be separately customized with [`~Trainer.create_optimizer`] and [`~Trainer.create_scheduler`] respectively
+* [`~Trainer.compute_loss`] computes the loss on a batch of training inputs
+* [`~Trainer.training_step`] performs the training step
+* [`~Trainer.prediction_step`] performs the prediction and test step
+* [`~Trainer.evaluate`] evaluates the model and returns the evaluation metrics
+* [`~Trainer.predict`] makes predictions (with metrics if labels are available) on the test set
+
+For example, if you want to customize the [`~Trainer.compute_loss`] method to use a weighted loss instead:
+
+
+```py
+import torch
+from torch import nn
+from transformers import Trainer
+
+class CustomTrainer(Trainer):
+ def compute_loss(self, model, inputs, return_outputs=False):
+ labels = inputs.pop("labels")
+ # forward pass
+ outputs = model(**inputs)
+ logits = outputs.get("logits")
+ # compute custom loss for 3 labels with different weights
+ loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
+ loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
+ return (loss, outputs) if return_outputs else loss
+```
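+
+You would then instantiate and run `CustomTrainer` exactly like the stock [`Trainer`] (the variable names below are the ones from the earlier example):
+
+```py
+trainer = CustomTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset["train"],
+    eval_dataset=dataset["test"],
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+```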
+
+### Callbacks
+
+Another option for customizing the [`Trainer`] is to use [callbacks](callbacks). Callbacks *don't change* anything in the training loop. They inspect the training loop state and then execute some action (early stopping, logging results, etc.) depending on the state. In other words, a callback can't be used to implement something like a custom loss function; for that, you need to subclass and override the [`~Trainer.compute_loss`] method.
+
+For example, if you want to add an early stopping callback to the training loop after 10 steps:
+
+```py
+from transformers import TrainerCallback
+
+class EarlyStoppingCallback(TrainerCallback):
+    def __init__(self, num_steps=10):
+        self.num_steps = num_steps
+
+    def on_step_end(self, args, state, control, **kwargs):
+        if state.global_step >= self.num_steps:
+            # flip the flag on the control object to tell the Trainer to stop
+            control.should_training_stop = True
+        return control
+```
+
+Then pass it to the [`Trainer`]'s `callbacks` parameter.
+
+```py
+from transformers import Trainer
+
+trainer = Trainer(
+ model=model,
+ args=training_args,
+ train_dataset=dataset["train"],
+ eval_dataset=dataset["test"],
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics,
+ callbacks=[EarlyStoppingCallback()],
+)
+```
+
+## Logging
+
+
+
+Check out the [logging API reference](./main_classes/logging) for more information about the different logging levels.
+
+
+
+The [`Trainer`] is set to `logging.INFO` by default, which reports errors, warnings, and other basic information. A [`Trainer`] replica - in distributed environments - is set to `logging.WARNING`, which only reports errors and warnings. You can change the logging level with the [`log_level`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level) and [`log_level_replica`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level_replica) parameters in [`TrainingArguments`].
+
+To configure the log level setting for each node, use the [`log_on_each_node`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.log_on_each_node) parameter to determine whether to use the log level on each node or only on the main node.
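+
+In code, that corresponds to something like the following sketch (the values are only illustrative):
+
+```py
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="your-model",
+    log_level="warning",        # log level on the main node
+    log_level_replica="error",  # log level on the replicas
+    log_on_each_node=False,     # log only on the main node instead of once per node
+)
+```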
+
+
+
+[`Trainer`] sets the log level separately for each node in the [`Trainer.__init__`] method, so you may want to consider setting this sooner if you're using other Transformers functionality before creating the [`Trainer`] object.
+
+
+
+For example, to set your main code and modules to use the same log level according to each node:
+
+```py
+import logging
+import sys
+
+import datasets
+import transformers
+
+logger = logging.getLogger(__name__)
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+
+log_level = training_args.get_process_log_level()
+logger.setLevel(log_level)
+datasets.utils.logging.set_verbosity(log_level)
+transformers.utils.logging.set_verbosity(log_level)
+
+trainer = Trainer(...)
+```
+
+Use different combinations of `log_level` and `log_level_replica` to configure what gets logged on each of the nodes.
+
+
+
+
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error
+```
+
+
+
+
+Add the `log_on_each_node 0` parameter for multi-node environments.
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
+
+# set to only report errors
+my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
+```
+
+
+
+
+## NEFTune
+
+[NEFTune](https://hf.co/papers/2310.05914) is a technique that can improve performance by adding noise to the embedding vectors during training. To enable it in [`Trainer`], set the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
+
+```py
+from transformers import TrainingArguments, Trainer
+
+training_args = TrainingArguments(..., neftune_noise_alpha=0.1)
+trainer = Trainer(..., args=training_args)
+```
+
+NEFTune is disabled after training to restore the original embedding layer and avoid any unexpected behavior.
+
+## Liger Kernel
+[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels developed by LinkedIn designed specifically for LLM training. We have implemented Hugging Face compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, with more to come. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. The kernels work out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed.
+
+Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It's also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (Medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples).
+First make sure to install the official Liger repository:
+```bash
+pip install liger-kernel
+```
+You should pass `use_liger_kernel=True` to apply the Liger kernels to your model, for example:
+
+```python
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+ output_dir="your-model",
+ learning_rate=2e-5,
+ per_device_train_batch_size=16,
+ per_device_eval_batch_size=16,
+ num_train_epochs=2,
+ weight_decay=0.01,
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ push_to_hub=True,
+ use_liger_kernel=True
+)
+```
+
+The kernels support the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value.
+
+## Optimizers
+You can choose a built-in optimizer for training using:
+```python
+from transformers import TrainingArguments
+training_args = TrainingArguments(..., optim="adamw_torch")
+```
+See [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) for a full list of choices. We include advanced examples in the sections below.
+
+You can also use an arbitrary PyTorch optimizer via:
+```python
+import torch
+
+optimizer_cls = torch.optim.AdamW
+optimizer_kwargs = {
+ "lr": 4e-3,
+ "betas": (0.9, 0.999),
+ "weight_decay": 0.05,
+}
+
+from transformers import Trainer
+trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs))
+```
+
+
+
+
+### GaLore
+
+Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning while being more memory-efficient than common low-rank adaptation methods, such as LoRA.
+
+First make sure to install the official GaLore repository:
+
+```bash
+pip install galore-torch
+```
+
+Then simply add one of `["galore_adamw", "galore_adafactor", "galore_adamw_8bit"]` in `optim` together with `optim_target_modules`, which can be a list of strings, regexes, or full paths corresponding to the target module names you want to adapt. Below is an end-to-end example script (make sure to `pip install trl datasets`):
+
+```python
+import torch
+import datasets
+import trl
+
+from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+ output_dir="./test-galore",
+ max_steps=100,
+ per_device_train_batch_size=2,
+ optim="galore_adamw",
+ optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
+)
+
+model_id = "google/gemma-2b"
+
+config = AutoConfig.from_pretrained(model_id)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_config(config).to(0)
+
+trainer = trl.SFTTrainer(
+ model=model,
+ args=args,
+ train_dataset=train_dataset,
+ dataset_text_field='text',
+ max_seq_length=512,
+)
+
+trainer.train()
+```
+
+To pass extra arguments supported by GaLore, you should pass `optim_args` correctly, for example:
+
+```python
+import torch
+import datasets
+import trl
+
+from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+ output_dir="./test-galore",
+ max_steps=100,
+ per_device_train_batch_size=2,
+ optim="galore_adamw",
+ optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
+ optim_args="rank=64, update_proj_gap=100, scale=0.10",
+)
+
+model_id = "google/gemma-2b"
+
+config = AutoConfig.from_pretrained(model_id)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_config(config).to(0)
+
+trainer = trl.SFTTrainer(
+ model=model,
+ args=args,
+ train_dataset=train_dataset,
+ dataset_text_field='text',
+ max_seq_length=512,
+)
+
+trainer.train()
+```
+You can read more about the method in the [original repository](https://github.com/jiaweizzhao/GaLore) or the [paper](https://arxiv.org/abs/2403.03507).
+
+Currently you can only train Linear layers that are considered GaLore layers; they will use low-rank decomposition to be trained while the remaining layers will be optimized in the conventional manner.
+
+Note it will take a bit of time before starting the training (~3 minutes for a 2B model on an NVIDIA A100), but training should go smoothly afterwards.
+
+You can also perform layer-wise optimization by appending `layerwise` to the optimizer name as shown below:
+
+```python
+import torch
+import datasets
+import trl
+
+from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+    output_dir="./test-galore",
+    max_steps=100,
+    per_device_train_batch_size=2,
+    optim="galore_adamw_layerwise",
+    optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
+)
+
+model_id = "google/gemma-2b"
+
+config = AutoConfig.from_pretrained(model_id)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_config(config).to(0)
+
+trainer = trl.SFTTrainer(
+    model=model,
+    args=args,
+    train_dataset=train_dataset,
+    dataset_text_field='text',
+    max_seq_length=512,
+)
+
+trainer.train()
+```
+
+Note that layerwise optimization is somewhat experimental and does not support DDP (Distributed Data Parallel), so you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc. may not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such an issue.
+
+### LOMO optimizers
+
+The LOMO optimizers were introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195).
+They both consist of an efficient full-parameter fine-tuning method. The LOMO optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. The supported LOMO optimizers are `"lomo"` and `"adalomo"`. First either install LOMO from pypi with `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`.
+
+
+
+According to the authors, it is recommended to use `AdaLomo` without `grad_norm` to get better performance and higher throughput.
+
+
+
+Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the IMDB dataset in full precision:
+
+```python
+import torch
+import datasets
+from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
+import trl
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+    output_dir="./test-lomo",
+    max_steps=100,
+    per_device_train_batch_size=4,
+    optim="adalomo",
+    gradient_checkpointing=True,
+    logging_strategy="steps",
+    logging_steps=1,
+    learning_rate=2e-6,
+    save_strategy="no",
+    run_name="lomo-imdb",
+)
+
+model_id = "google/gemma-2b"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+
+trainer = trl.SFTTrainer(
+    model=model,
+    args=args,
+    train_dataset=train_dataset,
+    dataset_text_field='text',
+    max_seq_length=1024,
+)
+
+trainer.train()
+```
+
+### GrokAdamW optimizer
+The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from `grokking` signal functions. To use `GrokAdamW`, first install the optimizer package with `pip install grokadamw`.
+
+GrokAdamW is particularly useful for models that require advanced optimization techniques to achieve better performance and stability.
+
+
+Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the IMDB dataset using the GrokAdamW optimizer:
+```python
+import torch
+import datasets
+from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, Trainer
+
+# Load the IMDB dataset
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+# Define the training arguments
+args = TrainingArguments(
+ output_dir="./test-grokadamw",
+ max_steps=1000,
+ per_device_train_batch_size=4,
+ optim="grokadamw",
+ logging_strategy="steps",
+ logging_steps=1,
+ learning_rate=2e-5,
+ save_strategy="no",
+ run_name="grokadamw-imdb",
+)
+
+# Load the model and tokenizer
+model_id = "google/gemma-2b"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+
+# Initialize the Trainer
+trainer = Trainer(
+ model=model,
+ args=args,
+ train_dataset=train_dataset,
+)
+
+# Train the model
+trainer.train()
+```
+This script demonstrates how to fine-tune the google/gemma-2b model on the IMDB dataset using the GrokAdamW optimizer. The TrainingArguments are configured to use GrokAdamW, and the dataset is passed to the Trainer for training.
+
+### Schedule-Free Optimizer
+The Schedule-Free optimizers were introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682).
+Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, completely removing the need to anneal the learning rate with a traditional schedule.
+The supported optimizers for SFO are `"schedule_free_adamw"` and `"schedule_free_sgd"`. First install `schedulefree` from pypi with `pip install schedulefree`.
+
+Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the IMDB dataset in full precision:
+```python
+import torch
+import datasets
+from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
+import trl
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+ output_dir="./test-schedulefree",
+ max_steps=1000,
+ per_device_train_batch_size=4,
+ optim="schedule_free_adamw",
+ gradient_checkpointing=True,
+ logging_strategy="steps",
+ logging_steps=1,
+ learning_rate=2e-6,
+ save_strategy="no",
+ run_name="sfo-imdb",
+)
+
+model_id = "google/gemma-2b"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+
+trainer = trl.SFTTrainer(
+ model=model,
+ args=args,
+ train_dataset=train_dataset,
+ dataset_text_field='text',
+ max_seq_length=1024,
+)
+
+trainer.train()
+```
+## Accelerate and Trainer
+
+The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/).
+
+
+
+Learn more about FSDP sharding strategies, CPU offloading, and more with the [`Trainer`] in the [Fully Sharded Data Parallel guide](fsdp).
+
+
+
+To use Accelerate with [`Trainer`], run the [`accelerate.config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to set up training for your training environment. This command creates a `config_file.yaml` that'll be used when you launch your training script. For example, some of the configurations you can set up are:
+
+
+
+
+```yml
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0 #change rank as per the node
+main_process_ip: 192.168.20.1
+main_process_port: 9898
+main_training_function: main
+mixed_precision: fp16
+num_machines: 2
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+
+
+
+```yml
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
+ fsdp_forward_prefetch: true
+ fsdp_offload_params: false
+ fsdp_sharding_strategy: 1
+ fsdp_state_dict_type: FULL_STATE_DICT
+ fsdp_sync_module_states: true
+ fsdp_transformer_layer_cls_to_wrap: BertLayer
+ fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+
+
+
+```yml
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+ deepspeed_config_file: /home/user/configs/ds_zero3_config.json
+ zero3_init_flag: true
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 4
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+
+
+
+```yml
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+ gradient_accumulation_steps: 1
+ gradient_clipping: 0.7
+ offload_optimizer_device: cpu
+ offload_param_device: cpu
+ zero3_init_flag: true
+ zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+
+
+The [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command is the recommended way to launch your training script on a distributed system with Accelerate and [`Trainer`] with the parameters specified in `config_file.yaml`. This file is saved to the Accelerate cache folder and automatically loaded when you run `accelerate_launch`.
+
+For example, to run the [run_glue.py](https://github.com/huggingface/transformers/blob/f4db565b695582891e43a5e042e5d318e28f20b8/examples/pytorch/text-classification/run_glue.py#L4) training script with the FSDP configuration:
+
+```bash
+accelerate launch \
+ ./examples/pytorch/text-classification/run_glue.py \
+ --model_name_or_path google-bert/bert-base-cased \
+ --task_name $TASK_NAME \
+ --do_train \
+ --do_eval \
+ --max_seq_length 128 \
+ --per_device_train_batch_size 16 \
+ --learning_rate 5e-5 \
+ --num_train_epochs 3 \
+ --output_dir /tmp/$TASK_NAME/ \
+ --overwrite_output_dir
+```
+
+You could also specify the parameters from the `config_file.yaml` file directly in the command line:
+
+```bash
+accelerate launch --num_processes=2 \
+ --use_fsdp \
+ --mixed_precision=bf16 \
+ --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
+ --fsdp_transformer_layer_cls_to_wrap="BertLayer" \
+ --fsdp_sharding_strategy=1 \
+ --fsdp_state_dict_type=FULL_STATE_DICT \
+ ./examples/pytorch/text-classification/run_glue.py \
+ --model_name_or_path google-bert/bert-base-cased \
+ --task_name $TASK_NAME \
+ --do_train \
+ --do_eval \
+ --max_seq_length 128 \
+ --per_device_train_batch_size 16 \
+ --learning_rate 5e-5 \
+ --num_train_epochs 3 \
+ --output_dir /tmp/$TASK_NAME/ \
+ --overwrite_output_dir
+```
+
+Check out the [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) tutorial to learn more about `accelerate_launch` and custom configurations.
diff --git a/docs/source/ar/troubleshooting.md b/docs/source/ar/troubleshooting.md
new file mode 100644
index 00000000000000..7874a9fad13304
--- /dev/null
+++ b/docs/source/ar/troubleshooting.md
@@ -0,0 +1,171 @@
+# Troubleshooting
+
+Sometimes errors occur, but we are here to help! This guide covers some of the most common issues we've seen and how you can resolve them. However, this guide isn't meant to be a comprehensive collection of every 🤗 Transformers issue. For more help with troubleshooting your issue, try:
+
+
+
+1. Asking for help on the [forums](https://discuss.huggingface.co/). There are specific categories you can post your question to, like [Beginners](https://discuss.huggingface.co/c/beginners/5) or [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). Make sure you write a good descriptive forum post with some reproducible code to maximize the likelihood that your problem is solved!
+
+
+2. Creating an [Issue](https://github.com/huggingface/transformers/issues/new/choose) on the 🤗 Transformers repository if it is a bug related to the library. Try to include as much information describing the bug as possible to help us better figure out what's wrong and how to fix it.
+
+3. Checking the [Migration](migration) guide if you use an older version of the 🤗 Transformers library since some important changes have been introduced between versions.
+
+
+For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course.
+
+## Firewalled environments
+
+Some GPU instances on the cloud and intranet setups are firewalled from external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then time out with an error like:
+
+```
+ValueError: Connection error, and we cannot find the requested files in the cached path.
+Please try again or make sure your Internet connection is on.
+```
+
+In this case, you should try to run 🤗 Transformers in [offline mode](installation#offline-mode) to avoid the connection error.
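+
+A minimal sketch of what that looks like (it assumes the model you need is already in your local cache):
+
+```python
+import os
+
+# use only files that are already in the local cache; no network calls are made
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+```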
+
+## CUDA out of memory
+
+Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may run into when the GPU runs out of memory is:
+
+```
+CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch)
+```
+
+Here are some potential solutions you can try to lessen memory use:
+
+- Reduce the [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) value in [`TrainingArguments`].
+
+- Try using [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) in [`TrainingArguments`] to effectively increase the overall batch size, as sketched below.
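+
+For example, a rough sketch combining both settings (the numbers are only illustrative):
+
+```python
+from transformers import TrainingArguments
+
+# an effective batch size of 4 * 8 = 32 while only holding 4 samples per device in memory
+training_args = TrainingArguments(
+    output_dir="outputs",
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=8,
+)
+```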
+
+
+Refer to the [Performance](performance) guide for more details about memory-saving techniques.
+
+
+## Unable to load a saved TensorFlow model
+
+TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) method will save the entire model - architecture, weights, training configuration - in a single file. However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. To avoid issues with saving and loading TensorFlow models, we recommend you:
+
+- Save the model weights as an `h5` file with [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) and then reload the model with [`~TFPreTrainedModel.from_pretrained`]:
+
+```python
+>>> from transformers import TFPreTrainedModel
+>>> from tensorflow import keras
+
+>>> model.save_weights("some_folder/tf_model.h5")
+>>> model = TFPreTrainedModel.from_pretrained("some_folder")
+```
+
+- Save the model with [`~TFPreTrainedModel.save_pretrained`] and load it again with [`~TFPreTrainedModel.from_pretrained`]:
+
+```python
+>>> from transformers import TFPreTrainedModel
+
+>>> model.save_pretrained("path_to/model")
+>>> model = TFPreTrainedModel.from_pretrained("path_to/model")
+```
+
+## ImportError
+
+Another common error you may encounter, especially if it is a newly released model, is `ImportError`:
+
+```
+ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location)
+```
+
+For these error types, check to make sure you have the latest version of the Hugging Face Transformers library installed to access the most recent models:
+
+```bash
+pip install transformers --upgrade
+```
+
+## CUDA error: device-side assert triggered
+
+Sometimes you may run into a generic CUDA error about an error in the device code.
+
+```
+RuntimeError: CUDA error: device-side assert triggered
+```
+
+You should try to run the code on a CPU first to get a more descriptive error message. Add the following environment variable at the beginning of your code to switch to a CPU:
+
+```python
+>>> import os
+
+>>> os.environ["CUDA_VISIBLE_DEVICES"] = ""
+```
+
+Another option is to get a better traceback from the GPU. Add the following environment variable at the beginning of your code to get the traceback to point to the source of the error:
+
+```python
+>>> import os
+
+>>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+```
+
+## Incorrect output when padding tokens aren't masked
+
+In some cases, the output `hidden_state` may be incorrect if the `input_ids` include padding tokens. To demonstrate this, load a model and tokenizer. You can access a model's `pad_token_id` to see its value. The `pad_token_id` may be `None` for some models, but you can always manually set it.
+
+```python
+>>> from transformers import AutoModelForSequenceClassification
+>>> import torch
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
+>>> model.config.pad_token_id
+0
+```
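+
+If `pad_token_id` turns out to be `None`, a common workaround (assuming the tokenizer defines an end-of-sequence token) is to reuse the EOS token for padding:
+
+```python
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+>>> if tokenizer.pad_token is None:
+...     tokenizer.pad_token = tokenizer.eos_token
+```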
+
+The following example shows the output without masking the padding tokens:
+
+```python
+>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]])
+>>> output = model(input_ids)
+>>> print(output.logits)
+tensor([[ 0.0082, -0.2307],
+        [ 0.1317, -0.1683]], grad_fn=<AddmmBackward0>)
+```
+
+Here is the actual output of the second sequence:
+
+```python
+>>> input_ids = torch.tensor([[7592]])
+>>> output = model(input_ids)
+>>> print(output.logits)
+tensor([[-0.1008, -0.4061]], grad_fn=<AddmmBackward0>)
+```
+
+Most of the time, you should provide an `attention_mask` to your model to ignore the padding tokens and avoid this silent error. Now the output of the second sequence matches its actual output:
+
+
+By default, the tokenizer creates an `attention_mask` for you based on your specific tokenizer's defaults.
+
+
+```python
+>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]])
+>>> output = model(input_ids, attention_mask=attention_mask)
+>>> print(output.logits)
+tensor([[ 0.0082, -0.2307],
+        [-0.1008, -0.4061]], grad_fn=<AddmmBackward0>)
+```
+
+🤗 Transformers doesn't automatically create an `attention_mask` to mask a padding token if it is provided because:
+
+- Some models don't have a padding token.
+
+- For some use-cases, users want a model to attend to a padding token.
+
+## ValueError: Unrecognized configuration class XYZ for this kind of AutoModel
+
+Generally, we recommend using the [`AutoModel`] class to load pretrained instances of models. This class can automatically infer and load the correct architecture from a given checkpoint based on the configuration. If you see this `ValueError` when loading a model from a checkpoint, this means the Auto class couldn't find a mapping from the configuration in the given checkpoint to the kind of model you are trying to load. Most commonly, this happens when a checkpoint doesn't support a given task.
+
+For instance, you'll see this error in the following example because there is no GPT2 model for question answering:
+
+```py
+>>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
+
+>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium")
+ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering.
+Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ...
+```
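+
+If you run into this error, a sketch of the usual fix is to pick an Auto class for a task the checkpoint does support; for GPT2 that is causal language modeling:
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")
+>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-medium")
+```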
diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md
index 6bbb6e10cb4942..850d9a3454a9c1 100644
--- a/docs/source/de/model_sharing.md
+++ b/docs/source/de/model_sharing.md
@@ -43,7 +43,7 @@ Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" l
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash
... )
```
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index a7806059afaa59..ca7ee4557feec7 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -218,6 +218,8 @@
title: CPU inference
- local: perf_infer_gpu_one
title: GPU inference
+ - local: perf_infer_gpu_multi
+ title: Multi-GPU inference
title: Optimizing inference
- local: big_models
title: Instantiate a big model
@@ -514,6 +516,8 @@
title: Nyströmformer
- local: model_doc/olmo
title: OLMo
+ - local: model_doc/olmo_1124
+ title: OLMo November 2024
- local: model_doc/olmoe
title: OLMoE
- local: model_doc/open-llama
diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index 2327357525d8d9..e80e402d7374cd 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -66,10 +66,10 @@ manager_agent.run("Who is the CEO of Hugging Face?")
Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator.
-If you need to add variation, like custom attributes for your too, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.
+If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.
The custom tool needs:
-- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`.
+- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`.
- An attribute `description` is used to populate the agent's system prompt.
- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
- An `output_type` attribute, which specifies the output type.
@@ -123,52 +123,70 @@ from transformers import load_tool, CodeAgent
model_download_tool = load_tool("m-ric/hf-model-downloads")
```
-### Use gradio-tools
+### Import a Space as a tool 🚀
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
-Face Spaces as tools. It supports many existing Spaces as well as custom Spaces.
+You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method!
-Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images.
+You only need to provide the id of the Space on the Hub, its name, and a description that will help your agent understand what the tool does. Under the hood, this will use the [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space.
-Import and instantiate the tool, then pass it to the `Tool.from_gradio` method:
+For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image.
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-from transformers import Tool, load_tool, CodeAgent
+```
+from transformers import Tool
-gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
-prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
+image_generation_tool = Tool.from_space(
+ "black-forest-labs/FLUX.1-dev",
+ name="image_generator",
+ description="Generate an image from a prompt")
+
+image_generation_tool("A sunny beach")
```
+And voilà, here's your image! 🏖️
-Now you can use it just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit`.
+
+
+Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it.
```python
-image_generation_tool = load_tool('huggingface-tools/text-to-image')
-agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[image_generation_tool])
agent.run(
"Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
)
```
-The model adequately leverages the tool:
```text
-======== New task ========
-Improve this prompt, then generate an image of it.
-You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
-==== Agent is executing the code below:
-improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-while improved_prompt == "QUEUE_FULL":
- improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(prompt=improved_prompt)
-====
+=== Agent thoughts:
+improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background"
+
+Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt.
+>>> Agent is executing the code below:
+image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background")
+final_answer(image)
```
-Before finally generating the image:
+
+
+How cool is this? 🤩
-
+### Use gradio-tools
+[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
+Face Spaces as tools. It supports many existing Spaces as well as custom Spaces.
+
+Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images.
+
+Import and instantiate the tool, then pass it to the `Tool.from_gradio` method:
+
+```python
+from gradio_tools import StableDiffusionPromptGeneratorTool
+from transformers import Tool, load_tool, CodeAgent
+
+gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
+prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
+```
> [!WARNING]
> gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible.
@@ -179,7 +197,7 @@ We love Langchain and think it has a very compelling suite of tools.
To import a tool from LangChain, use the `from_langchain()` method.
Here is how you can use it to recreate the intro's search result using a LangChain web search tool.
-
+This tool will need `pip install google-search-results` to work properly.
```python
from langchain.agents import load_tools
from transformers import Tool, ReactCodeAgent
@@ -188,7 +206,7 @@ search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
agent = ReactCodeAgent(tools=[search_tool])
-agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
+agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?")
```
## Display your agent run in a cool Gradio interface
@@ -240,4 +258,4 @@ with gr.Blocks() as demo:
if __name__ == "__main__":
demo.launch()
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md
index 64ded9613716a5..380b39fe62acdf 100644
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@@ -403,7 +403,7 @@ culture, and they allow us to design the'
This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the
[`generate`] method, which gives you even further control over the [`generate`] method's behavior.
-For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation.md).
+For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation).
### Speculative Decoding
@@ -416,16 +416,6 @@ Assisted decoding assumes the main and assistant models have the same tokenizer,
Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs.
To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation).
-#### Universal Assisted Decoding
-
-Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
-To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
-Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
-in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
-The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
-Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
-to ensure the new tokens include the correct prompt suffix.
-
To enable assisted decoding, set the `assistant_model` argument with a model.
```python
@@ -445,7 +435,36 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
-If the main and assistant models have different tokenizers, use Universal Assisted Decoding.
+When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness,
+just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.
+
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
+>>> set_seed(42) # For reproducibility
+
+>>> prompt = "Alice and Bob"
+>>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
+>>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped"
+
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+>>> inputs = tokenizer(prompt, return_tensors="pt")
+
+>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
+>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
+>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+['Alice and Bob, a couple of friends of mine, who are both in the same office as']
+```
+
+#### Universal Assisted Decoding
+
+Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
+To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
+Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
+in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
+The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
+Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
+to ensure the new tokens include the correct prompt suffix.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -465,30 +484,35 @@ If the main and assistant models have different tokenizers, use Universal Assist
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
-When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness,
-just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.
+#### Prompt Lookup
+
+Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
+to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
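+
+A minimal sketch of what this looks like is below (the checkpoint, prompt, and number of candidate tokens are only placeholders, and the generated text is omitted):
+
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+>>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+>>> inputs = tokenizer("Alice and Bob", return_tensors="pt")
+
+>>> # candidate tokens are looked up as n-grams in the prompt itself, so no assistant model is needed
+>>> outputs = model.generate(**inputs, prompt_lookup_num_tokens=3)
+>>> text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+```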
+
+#### Self-Speculative Decoding
+
+An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively
+skipping layers to yield a lower-quality output -- a technique called early exiting.
+We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation.
+If the model you're using was trained to do early exit, you can pass
+`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the
+"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used.
```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
->>> set_seed(42) # For reproducibility
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
->>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped"
+>>> checkpoint = "facebook/layerskip-llama3.2-1B"
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
->>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
+>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob, a couple of friends of mine, who are both in the same office as']
+['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
-Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
-to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
-
### DoLa Decoding
**D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the
@@ -508,10 +532,11 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
>>> import torch
+>>> from accelerate.test_utils.testing import get_backend
>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16)
->>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
+>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model.to(device)
>>> set_seed(42)
diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md
index 20531b990bc341..b1ed1f0d492ab9 100644
--- a/docs/source/en/gguf.md
+++ b/docs/source/en/gguf.md
@@ -86,6 +86,8 @@ For now the supported model architectures are the architectures that have been v
- GPT2
- Starcoder2
- T5
+- Mamba
+- Nemotron
## Example usage
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index aaff45ab65dfb6..341cb417c7b8ac 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -240,6 +240,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ |
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ |
+| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ |
| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ |
| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
index 946940cb019481..eb25ddb6329755 100644
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -428,13 +428,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- __call__
[[autodoc]] BayesianDetectorConfig
- - __call__
[[autodoc]] BayesianDetectorModel
- - __call__
+ - forward
[[autodoc]] SynthIDTextWatermarkingConfig
- - __call__
[[autodoc]] SynthIDTextWatermarkDetector
- __call__
diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md
index 59a78e68214d6d..320916f1ce9421 100644
--- a/docs/source/en/main_classes/image_processor.md
+++ b/docs/source/en/main_classes/image_processor.md
@@ -18,6 +18,49 @@ rendered properly in your Markdown viewer.
An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks.
+Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU.
+They have the same API as the base image processors and can be used as drop-in replacements.
+To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor:
+
+```python
+from transformers import AutoImageProcessor
+
+processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
+```
+
+When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise.
+
+```python
+from torchvision.io import read_image
+from transformers import DetrImageProcessorFast
+
+images = read_image("image.jpg")
+processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")
+images_processed = processor(images, return_tensors="pt", device="cuda")
+```
+
+Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
+
+These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU.
+
## ImageProcessingMixin
diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md
index d5d132aaaba566..59e474fcc49f75 100644
--- a/docs/source/en/main_classes/pipelines.md
+++ b/docs/source/en/main_classes/pipelines.md
@@ -478,6 +478,12 @@ Pipelines available for multimodal tasks include the following.
- __call__
- all
+### ImageTextToTextPipeline
+
+[[autodoc]] ImageTextToTextPipeline
+ - __call__
+ - all
+
### MaskGenerationPipeline
[[autodoc]] MaskGenerationPipeline
diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md
index 2ad7e450404e77..83d2ae5df6a7fb 100644
--- a/docs/source/en/main_classes/tokenizer.md
+++ b/docs/source/en/main_classes/tokenizer.md
@@ -51,6 +51,25 @@ token space (e.g., getting the index of the token comprising a given character o
to a given token).
+# Multimodal Tokenizer
+
+Apart from that, each tokenizer can be a "multimodal" tokenizer, which means that the tokenizer will hold all relevant special tokens
+as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
+be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.
+
+To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
+have to be modality related and can be anything that the model often needs access to. In the code below, the tokenizer saved at `output_dir` will have direct access
+to three more special tokens.
+
+```python
+vision_tokenizer = AutoTokenizer.from_pretrained(
+ "llava-hf/llava-1.5-7b-hf",
+ extra_special_tokens={"image_token": "", "boi_token": "", "eoi_token": ""}
+)
+print(vision_tokenizer.image_token, vision_tokenizer.image_token_id)
+("", 32000)
+```
+
## PreTrainedTokenizer
[[autodoc]] PreTrainedTokenizer
diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md
index b57c69ca6b321b..4125d372d55ad5 100644
--- a/docs/source/en/model_doc/blip-2.md
+++ b/docs/source/en/model_doc/blip-2.md
@@ -40,6 +40,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5
- BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method.
- One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text.
+> [!NOTE]
+> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be a failure when merging the embeddings.
+The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
+
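+A rough sketch of what adding the attribute looks like if you own the checkpoint (the repository id below is a placeholder, and expanding the embeddings is covered in the linked gist):
+
+```python
+from transformers import Blip2ForConditionalGeneration, Blip2Processor
+
+model = Blip2ForConditionalGeneration.from_pretrained("your-org/your-blip2-checkpoint")
+processor = Blip2Processor.from_pretrained("your-org/your-blip2-checkpoint")
+
+# copy the value from the model config onto the processor and push the updated processor
+processor.num_query_tokens = model.config.num_query_tokens
+processor.push_to_hub("your-org/your-blip2-checkpoint")
+```
+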
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2.
diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md
index 82ef251d478b95..5ed99dfe81d1c0 100644
--- a/docs/source/en/model_doc/deformable_detr.md
+++ b/docs/source/en/model_doc/deformable_detr.md
@@ -54,6 +54,12 @@ If you're interested in submitting a resource to be included here, please feel f
- preprocess
- post_process_object_detection
+## DeformableDetrImageProcessorFast
+
+[[autodoc]] DeformableDetrImageProcessorFast
+ - preprocess
+ - post_process_object_detection
+
## DeformableDetrFeatureExtractor
[[autodoc]] DeformableDetrFeatureExtractor
diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md
index b5fc634b621626..904a96bc786f07 100644
--- a/docs/source/en/model_doc/instructblip.md
+++ b/docs/source/en/model_doc/instructblip.md
@@ -33,6 +33,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former.
+> [!NOTE]
+> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be a failure when merging the embeddings.
+The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
+
## InstructBlipConfig
[[autodoc]] InstructBlipConfig
diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md
index aa93feb6b6dced..8b2207ce176566 100644
--- a/docs/source/en/model_doc/instructblipvideo.md
+++ b/docs/source/en/model_doc/instructblipvideo.md
@@ -35,6 +35,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
- The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames
+> [!NOTE]
+> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be a failure when merging the embeddings.
+The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
+
## InstructBlipVideoConfig
[[autodoc]] InstructBlipVideoConfig
diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md
index 99950a2ffd8e93..dec19ca5ef45db 100644
--- a/docs/source/en/model_doc/llava.md
+++ b/docs/source/en/model_doc/llava.md
@@ -40,6 +40,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
+
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you.
+Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings.
+The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches.
+
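+As a minimal sketch (using `llava-hf/llava-1.5-7b-hf` as the example checkpoint and assuming its CLIP vision backbone, which adds a CLS token), the attributes can be set on the processor as follows:
+
+```python
+from transformers import AutoConfig, AutoProcessor
+
+checkpoint = "llava-hf/llava-1.5-7b-hf"  # example checkpoint, replace with your own
+config = AutoConfig.from_pretrained(checkpoint)
+processor = AutoProcessor.from_pretrained(checkpoint)
+
+# Copy the relevant attributes from the model config onto the processor
+processor.patch_size = config.vision_config.patch_size
+processor.vision_feature_select_strategy = config.vision_feature_select_strategy
+processor.num_additional_image_tokens = 1  # 1 because the CLIP vision backbone adds a CLS token
+```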
+
### Single image inference
For best results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows:
@@ -85,10 +92,10 @@ LLaVa also supports batched inference. Here is how you can do it:
import requests
from PIL import Image
import torch
-from transformers import AutoProcessor, LLavaForConditionalGeneration
+from transformers import AutoProcessor, LlavaForConditionalGeneration
# Load the model in half-precision
-model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
+model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# Get two different images
diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md
index b9146fbd33478a..88bd63e7101f17 100644
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@@ -53,6 +53,12 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
+> Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
+> The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
+
+
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.
We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:
diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md
index fe905dfb7932ab..f8a149f12b6779 100644
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
@@ -50,6 +50,12 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
+> Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
+> The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
+
+
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows:
diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md
index 9cb038ed2e3453..4a6080ea2ce03a 100644
--- a/docs/source/en/model_doc/mllama.md
+++ b/docs/source/en/model_doc/mllama.md
@@ -30,6 +30,25 @@ The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a
- The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted.
- The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor.
+
+
+
+Mllama has an extra token used as a placeholder for image positions in the text. This means that the input ids and the input embedding layer will have one extra token. But since the weights for the input and output embeddings are not tied, the `lm_head` layer has one fewer token and will fail if you want to calculate loss on image tokens or apply logit processors. If you are training, make sure to mask out the special `"<|image|>"` tokens in the `labels`, as the model should not be trained to predict them.
+
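+A minimal sketch of that masking, assuming a `processor` and a batch of `input_ids` (as a tensor) are already in scope:
+
+```python
+# Mask the "<|image|>" placeholder in the labels so that no loss is computed on it
+image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image|>")
+labels = input_ids.clone()
+labels[labels == image_token_id] = -100  # -100 is ignored by the loss
+```
+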
+Otherwise, if you see CUDA-side index errors when generating, use the code below to expand the `lm_head` by one more token.
+
+
+```python
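+# Expand the lm_head by one token so that it matches the input embeddings, which include the extra "<|image|>" token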
+old_embeddings = model.get_output_embeddings()
+
+num_tokens = model.vocab_size + 1
+resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=num_tokens, mean_resizing=True)
+resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad)
+model.set_output_embeddings(resized_embeddings)
+```
+
+
+
## Usage Example
#### Instruct model
diff --git a/docs/source/en/model_doc/olmo_1124.md b/docs/source/en/model_doc/olmo_1124.md
new file mode 100644
index 00000000000000..f36ec438e57a84
--- /dev/null
+++ b/docs/source/en/model_doc/olmo_1124.md
@@ -0,0 +1,46 @@
+
+
+# OLMo November 2024
+
+## Overview
+
+The OLMo November 2024 model is a successor of the OLMo model, which was proposed in
+[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838).
+
+The architectural changes from the original OLMo model to this model are:
+
+- RMSNorm is used instead of standard layer norm.
+- Norm is applied to attention queries and keys.
+- Norm is applied after attention/feedforward layers rather than before.
+
+This model was contributed by [shanearora](https://huggingface.co/shanearora).
+The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo).
+
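+A minimal usage sketch with the classes documented below (the repo id is a placeholder, check the Hub for released OLMo November 2024 checkpoints):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "allenai/OLMo-1124-7B"  # placeholder repo id, substitute a released checkpoint
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint)
+
+inputs = tokenizer("Language modeling is ", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```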
+
+## Olmo1124Config
+
+[[autodoc]] Olmo1124Config
+
+## Olmo1124Model
+
+[[autodoc]] Olmo1124Model
+ - forward
+
+## Olmo1124ForCausalLM
+
+[[autodoc]] Olmo1124ForCausalLM
+ - forward
diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md
index 5540266c6215de..6a1545e123297c 100644
--- a/docs/source/en/model_doc/rt_detr.md
+++ b/docs/source/en/model_doc/rt_detr.md
@@ -46,7 +46,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ
>>> from PIL import Image
>>> from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
@@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ
>>> with torch.no_grad():
... outputs = model(**inputs)
->>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
+>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3)
>>> for result in results:
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
@@ -95,6 +95,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
- preprocess
- post_process_object_detection
+## RTDetrImageProcessorFast
+
+[[autodoc]] RTDetrImageProcessorFast
+ - preprocess
+ - post_process_object_detection
+
## RTDetrModel
[[autodoc]] RTDetrModel
diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index b9aab2f1b929f2..59e451adceb817 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -86,24 +86,32 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
inputs = processor(images, return_tensors="pt")
outputs = model(**inputs)
-
-for i in range(len(images)):
- image_mask = outputs.mask[i]
- image_indices = torch.nonzero(image_mask).squeeze()
- image_keypoints = outputs.keypoints[i][image_indices]
- image_scores = outputs.scores[i][image_indices]
- image_descriptors = outputs.descriptors[i][image_indices]
+image_sizes = [(image.height, image.width) for image in images]
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+
+for output in outputs:
+ for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]):
+ print(f"Keypoints: {keypoints}")
+ print(f"Scores: {scores}")
+ print(f"Descriptors: {descriptors}")
```
-You can then print the keypoints on the image to visualize the result :
+You can then print the keypoints on the image of your choice to visualize the result:
```python
-import cv2
-for keypoint, score in zip(image_keypoints, image_scores):
- keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item())
- color = tuple([score.item() * 255] * 3)
- image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color)
-cv2.imwrite("output_image.png", image)
+import matplotlib.pyplot as plt
+
+plt.axis("off")
+plt.imshow(image_1)
+plt.scatter(
+ outputs[0]["keypoints"][:, 0],
+ outputs[0]["keypoints"][:, 1],
+ c=outputs[0]["scores"] * 100,
+ s=outputs[0]["scores"] * 50,
+ alpha=0.8
+)
+plt.savefig(f"output_image.png")
```
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png)
This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).
@@ -123,6 +131,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] SuperPointImageProcessor
- preprocess
+- post_process_keypoint_detection
## SuperPointForKeypointDetection
diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md
index 1c4b5b4b874dd7..105307196effd0 100644
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@@ -54,6 +54,12 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT
The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA).
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
+> Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
+> The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
+
+
## Usage example
### Single Media Mode
diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md
index b3e76cd292e40a..328310f3e26b77 100644
--- a/docs/source/en/model_doc/vipllava.md
+++ b/docs/source/en/model_doc/vipllava.md
@@ -39,6 +39,12 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
+> Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
+> The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
+
+
- For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows:
```python
diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md
index ec5802cfee372e..076fc2ccdd571a 100644
--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@@ -43,7 +43,7 @@ As a result, you can load a specific model version with the `revision` parameter
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash
... )
```
diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md
new file mode 100644
index 00000000000000..9975094411527a
--- /dev/null
+++ b/docs/source/en/perf_infer_gpu_multi.md
@@ -0,0 +1,68 @@
+
+
+# Multi-GPU inference
+
+Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication.
+
+To enable tensor parallelism, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]:
+
+```python
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+# Initialize distributed
+rank = int(os.environ["RANK"])
+device = torch.device(f"cuda:{rank}")
+torch.distributed.init_process_group("nccl", device_id=device)
+
+# Retrieve tensor parallel model
+model = AutoModelForCausalLM.from_pretrained(
+ model_id,
+ tp_plan="auto",
+)
+
+# Prepare input tokens
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+prompt = "Can I help"
+inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+
+# Distributed run
+outputs = model(inputs)
+```
+
+You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU:
+
+```
+torchrun --nproc-per-node 4 demo.py
+```
+
+PyTorch tensor parallelism is currently supported for the following models:
+* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
+
+You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request.
+
+### Expected speedups
+
+You can benefit from considerable speedups for inference, especially for inputs with large batch sizes or long sequences.
+
+For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
+
+
+
+
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 67bd31fdaeede5..84109746f95998 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -77,6 +77,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
+* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
@@ -260,6 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
+* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md
index 14a52792d1f7d8..7ef98932d537ac 100644
--- a/docs/source/en/perf_train_cpu.md
+++ b/docs/source/en/perf_train_cpu.md
@@ -18,11 +18,11 @@ rendered properly in your Markdown viewer.
This guide focuses on training large models efficiently on CPU.
## Mixed precision with IPEX
-Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. Modern CPUs such as 3rd and 4th Gen Intel® Xeon® Scalable processors natively support bf16, so you should get more performance out of the box by enabling mixed precision training with bf16.
+Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. Modern CPUs such as 3rd, 4th, and 5th Gen Intel® Xeon® Scalable processors natively support bf16. 6th Gen Intel® Xeon® Scalable processors natively support bf16 and fp16. You should get more performance out of the box by enabling mixed precision training with bf16 or fp16.
To further maximize training performance, you can use Intel® Extension for PyTorch (IPEX), which is a library built on PyTorch and adds additional CPU instruction level architecture (ISA) level support such as Intel® Advanced Vector Extensions 512 Vector Neural Network Instructions (Intel® AVX512-VNNI), and Intel® Advanced Matrix Extensions (Intel® AMX) for an extra performance boost on Intel CPUs. However, CPUs with only AVX2 (e.g., AMD or older Intel CPUs) are not guaranteed to have better performance under IPEX.
-Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. AMP support for bf16 on CPUs and bf16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. You can get better performance and user experience with IPEX AMP.
+Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. AMP support for bf16/fp16 on CPUs and bf16/fp16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. You can get better performance and user experience with IPEX AMP.
Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html).
@@ -32,10 +32,10 @@ IPEX release is following PyTorch, to install via pip:
| PyTorch Version | IPEX version |
| :---------------: | :----------: |
-| 2.1.x | 2.1.100+cpu |
-| 2.0.x | 2.0.100+cpu |
-| 1.13 | 1.13.0+cpu |
-| 1.12 | 1.12.300+cpu |
+| 2.5.0 | 2.5.0+cpu |
+| 2.4.0 | 2.4.0+cpu |
+| 2.3.0 | 2.3.0+cpu |
+| 2.2.0 | 2.2.0+cpu |
Please run `pip list | grep torch` to get your `pytorch_version`, so you can get the `IPEX version_name`.
```bash
@@ -46,7 +46,7 @@ You can check the latest versions in [ipex-whl-stable-cpu](https://developer.int
Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html).
### Usage in Trainer
-To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments.
+To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` or `fp16`, and `no_cuda` to the training command arguments.
Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index f528378bd1b875..ed782caca3b1f1 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -30,46 +30,32 @@ Check more detailed information for [oneccl_bind_pt](https://github.com/intel/to
Wheel files are available for the following Python versions:
-| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 |
-| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: |
-| 2.1.0 | | √ | √ | √ | √ |
-| 2.0.0 | | √ | √ | √ | √ |
-| 1.13.0 | | √ | √ | √ | √ |
-| 1.12.100 | | √ | √ | √ | √ |
-| 1.12.0 | | √ | √ | √ | √ |
+| Extension Version | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 |
+| :---------------: | :--------: | :--------: | :--------: | :---------: | :---------: |
+| 2.5.0 | | √ | √ | √ | √ |
+| 2.4.0 | | √ | √ | √ | √ |
+| 2.3.0 | | √ | √ | √ | √ |
+| 2.2.0 | | √ | √ | √ | √ |
Please run `pip list | grep torch` to get your `pytorch_version`.
```bash
pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu
```
-where `{pytorch_version}` should be your PyTorch version, for instance 2.1.0.
+where `{pytorch_version}` should be your PyTorch version, for instance 2.4.0.
Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl).
Versions of oneCCL and PyTorch must match.
-
-
-oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0)
-PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100
-
-
## Intel® MPI library
Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit.
oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it.
-for Intel® oneCCL >= 1.12.0
```bash
oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)")
source $oneccl_bindings_for_pytorch_path/env/setvars.sh
```
-for Intel® oneCCL whose version < 1.12.0
-```bash
-torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))")
-source $torch_ccl_path/env/setvars.sh
-```
-
#### Intel® Extension for PyTorch installation
Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the [single CPU section](./perf_train_cpu) to learn more).
@@ -155,7 +141,7 @@ This example assumes that you have:
The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image:
```dockerfile
-FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode
+FROM intel/intel-optimized-pytorch:2.4.0-pip-multinode
RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
@@ -165,7 +151,7 @@ RUN apt-get update -y && \
WORKDIR /workspace
# Download and extract the transformers code
-ARG HF_TRANSFORMERS_VER="4.44.0"
+ARG HF_TRANSFORMERS_VER="4.46.0"
RUN pip install --no-cache-dir \
transformers==${HF_TRANSFORMERS_VER} && \
mkdir transformers && \
@@ -319,4 +305,4 @@ with the job, the PyTorchJob resource can be deleted from the cluster using `kub
This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes
cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training
-performance, and can be used as a template to run your own workload on multiple nodes.
\ No newline at end of file
+performance, and can be used as a template to run your own workload on multiple nodes.
diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md
index 94e756cf33ada6..b9176be04ec206 100644
--- a/docs/source/en/performance.md
+++ b/docs/source/en/performance.md
@@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se
* [Inference on a single CPU](perf_infer_cpu)
* [Inference on a single GPU](perf_infer_gpu_one)
-* [Multi-GPU inference](perf_infer_gpu_one)
+* [Multi-GPU inference](perf_infer_gpu_multi)
* [XLA Integration for TensorFlow Models](tf_xla)
diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md
index 7555619fe488d2..ac7ef8504e72b6 100644
--- a/docs/source/en/perplexity.md
+++ b/docs/source/en/perplexity.md
@@ -107,7 +107,8 @@ max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)
-nlls = []
+nll_sum = 0.0
+n_tokens = 0
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
end_loc = min(begin_loc + max_length, seq_len)
@@ -124,13 +125,19 @@ for begin_loc in tqdm(range(0, seq_len, stride)):
# to the left by 1.
neg_log_likelihood = outputs.loss
- nlls.append(neg_log_likelihood)
+ # Accumulate the total negative log-likelihood and the total number of tokens
+ num_valid_tokens = (target_ids != -100).sum().item() # number of valid tokens in target_ids
+ batch_size = target_ids.size(0)
+ num_loss_tokens = num_valid_tokens - batch_size # subtract batch_size due to internal label shift
+ nll_sum += neg_log_likelihood * num_loss_tokens
+ n_tokens += num_loss_tokens
prev_end_loc = end_loc
if end_loc == seq_len:
break
-ppl = torch.exp(torch.stack(nlls).mean())
+avg_nll = nll_sum / n_tokens # average negative log-likelihood per token
+ppl = torch.exp(avg_nll)
```
Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
@@ -139,5 +146,5 @@ and the better the reported perplexity will typically be.
When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same
as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window
-strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is
+strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is
closer to the true autoregressive decomposition of a sequence likelihood.
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index ef8ed444d9d49b..0fb72d26058e55 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -45,19 +45,19 @@ In short, supporting a wide range of quantization methods allows you to pick the
Use the table below to help you decide which quantization method to use.
-| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
-|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
-| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
-| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
-| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
-| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
-| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
-| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
-| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
-| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
-| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto |
-| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
-| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
+| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
+|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
+| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
+| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
+| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
+| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
+| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
+| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
+| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
+| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
+| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
+| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
+| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index 18135b2ec2fce7..37df4ed589e840 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -14,21 +14,21 @@ rendered properly in your Markdown viewer.
-->
-# Quanto
+# Optimum-quanto
-Try Quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
+Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
-[🤗 Quanto](https://github.com/huggingface/quanto) library is a versatile pytorch quantization toolkit. The quantization method used is the linear quantization. Quanto provides several unique features such as:
+The [🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile PyTorch quantization toolkit. The quantization method used is linear quantization. Quanto provides several unique features such as:
- weights quantization (`float8`,`int8`,`int4`,`int2`)
- activation quantization (`float8`,`int8`)
- modality agnostic (e.g CV,LLM)
-- device agnostic (e.g CUDA,MPS,CPU)
+- device agnostic (e.g CUDA,XPU,MPS,CPU)
- compatibility with `torch.compile`
- easy to add custom kernel for specific device
- supports quantization aware training
@@ -37,12 +37,12 @@ Try Quanto + transformers with this [notebook](https://colab.research.google.com
Before you begin, make sure the following libraries are installed:
```bash
-pip install quanto accelerate transformers
+pip install optimum-quanto accelerate transformers
```
Now you can quantize a model by passing [`QuantoConfig`] object in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers.
-The integration with transformers only supports weights quantization. For the more complex use case such as activation quantization, calibration and quantization aware training, you should use [quanto](https://github.com/huggingface/quanto) library instead.
+The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -55,7 +55,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cud
Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead.
-Quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
+The optimum-quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation)
diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md
index a780124edea9c6..7e3335762ea43b 100644
--- a/docs/source/en/tasks/idefics.md
+++ b/docs/source/en/tasks/idefics.md
@@ -386,9 +386,9 @@ The use and prompting for the conversational use is very similar to using the ba
```py
>>> import torch
>>> from transformers import IdeficsForVisionText2Text, AutoProcessor
+>>> from accelerate.test_utils.testing import get_backend
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
+>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> checkpoint = "HuggingFaceM4/idefics-9b-instruct"
>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
>>> processor = AutoProcessor.from_pretrained(checkpoint)
diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md
index 633ccc491ebb35..9a78967cb5198d 100644
--- a/docs/source/en/tasks/image_captioning.md
+++ b/docs/source/en/tasks/image_captioning.md
@@ -256,8 +256,9 @@ image
Prepare image for the model.
```python
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
+from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+device, _, _ = get_backend()
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values
```
diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md
index 514ec3fbfe0b93..49fdc9db60d4d7 100644
--- a/docs/source/en/tasks/image_classification.md
+++ b/docs/source/en/tasks/image_classification.md
@@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag
This guide illustrates how to:
-1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
+1. Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
2. Use your fine-tuned model for inference.
diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md
index c9d794b0b2be38..80b701588b26b4 100644
--- a/docs/source/en/tasks/image_feature_extraction.md
+++ b/docs/source/en/tasks/image_feature_extraction.md
@@ -43,8 +43,9 @@ Let's see the pipeline in action. First, initialize the pipeline. If you don't p
```python
import torch
from transformers import pipeline
-
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+DEVICE, _, _ = get_backend()
pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-384", device=DEVICE, pool=True)
```
diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md
index 261abf947290d1..041efb06c575b4 100644
--- a/docs/source/en/tasks/image_text_to_text.md
+++ b/docs/source/en/tasks/image_text_to_text.md
@@ -120,6 +120,46 @@ print(generated_texts)
## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.']
```
+## Pipeline
+
+The fastest way to get started is to use the [`Pipeline`] API. Specify the `"image-text-to-text"` task and the model you want to use.
+
+```python
+from transformers import pipeline
+pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+```
+
+The example below uses chat templates to format the text inputs.
+
+```python
+messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+ },
+ {"type": "text", "text": "Describe this image."},
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "There's a pink flower"},
+ ],
+ },
+ ]
+```
+
+Pass the chat template formatted text and image to [`Pipeline`] and set `return_full_text=False` to remove the input from the generated output.
+
+```python
+outputs = pipe(text=messages, max_new_tokens=20, return_full_text=False)
+outputs[0]["generated_text"]
+# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
+```
+
## Streaming
We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
diff --git a/docs/source/en/tasks/image_to_image.md b/docs/source/en/tasks/image_to_image.md
index 0bb74b36980e0b..f1c62e47aebf24 100644
--- a/docs/source/en/tasks/image_to_image.md
+++ b/docs/source/en/tasks/image_to_image.md
@@ -37,8 +37,9 @@ We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co
```python
from transformers import pipeline
import torch
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+device, _, _ = get_backend()
pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)
```
diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
index 530e92d81f5c0d..17fb363df8e2a0 100644
--- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
+++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
@@ -58,7 +58,7 @@ from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F
-
+from accelerate.test_utils.testing import get_backend
class ImageDistilTrainer(Trainer):
def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs):
@@ -66,7 +66,7 @@ class ImageDistilTrainer(Trainer):
self.teacher = teacher_model
self.student = student_model
self.loss_function = nn.KLDivLoss(reduction="batchmean")
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
self.teacher.to(device)
self.teacher.eval()
self.temperature = temperature
diff --git a/docs/source/en/tasks/mask_generation.md b/docs/source/en/tasks/mask_generation.md
index 82202f58bca607..db16e035e303e0 100644
--- a/docs/source/en/tasks/mask_generation.md
+++ b/docs/source/en/tasks/mask_generation.md
@@ -125,9 +125,9 @@ the processor.
```python
from transformers import SamModel, SamProcessor
import torch
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
+from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+device, _, _ = get_backend()
model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
```
diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md
index 3ded3179154aae..edd22122f32bd6 100644
--- a/docs/source/en/tasks/monocular_depth_estimation.md
+++ b/docs/source/en/tasks/monocular_depth_estimation.md
@@ -53,8 +53,9 @@ Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggi
```py
>>> from transformers import pipeline
>>> import torch
-
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+>>> device, _, _ = get_backend()
>>> checkpoint = "depth-anything/Depth-Anything-V2-base-hf"
>>> pipe = pipeline("depth-estimation", model=checkpoint, device=device)
```
diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index fdc81896bc1924..c307dd3334fe92 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -1488,7 +1488,9 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
Load model and image processor from the Hugging Face Hub (skip to use already trained in this session):
```py
->>> device = "cuda"
+>>> from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+>>> device, _, _ = get_backend()
>>> model_repo = "qubvel-hf/detr_finetuned_cppe5"
>>> image_processor = AutoImageProcessor.from_pretrained(model_repo)
diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
index 912577589486ce..a21ff62edf1a56 100644
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@@ -689,7 +689,9 @@ Reload the dataset and load an image for inference.
We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU:
```py
->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU
+>>> from accelerate.test_utils.testing import get_backend
+# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
+>>> device, _, _ = get_backend()
>>> encoding = image_processor(image, return_tensors="pt")
>>> pixel_values = encoding.pixel_values.to(device)
```
diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md
index 188d4ea5f9ee68..e25da4e19efeaa 100644
--- a/docs/source/en/tasks/text-to-speech.md
+++ b/docs/source/en/tasks/text-to-speech.md
@@ -282,10 +282,10 @@ containing the corresponding speaker embedding.
>>> import os
>>> import torch
>>> from speechbrain.inference.classifiers import EncoderClassifier
+>>> from accelerate.test_utils.testing import get_backend
>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> speaker_model = EncoderClassifier.from_hparams(
... source=spk_model_name,
... run_opts={"device": device},
diff --git a/docs/source/en/tasks/visual_question_answering.md b/docs/source/en/tasks/visual_question_answering.md
index 7083d8c98b932e..87dbfb751bfa98 100644
--- a/docs/source/en/tasks/visual_question_answering.md
+++ b/docs/source/en/tasks/visual_question_answering.md
@@ -363,10 +363,11 @@ GPU, if available, which we didn't need to do earlier when training, as [`Traine
```py
>>> from transformers import AutoProcessor, Blip2ForConditionalGeneration
>>> import torch
+>>> from accelerate.test_utils.testing import get_backend
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model.to(device)
```
diff --git a/docs/source/en/tasks_explained.md b/docs/source/en/tasks_explained.md
index 7c836f70cfc427..1cc60ba096f04f 100644
--- a/docs/source/en/tasks_explained.md
+++ b/docs/source/en/tasks_explained.md
@@ -182,7 +182,7 @@ There are three main components to Mask2Former:
The mask predictions are generated by combining the pixel-embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice loss is calculated between the logits and the ground truth mask to find the most likely mask.
-Ready to try your hand at object detection? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
+Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
### Depth estimation
@@ -292,4 +292,4 @@ Ready to try your hand at translation? Check out our complete [translation guide
For more information about text generation, check out the [text generation strategies](generation_strategies) guide!
-
\ No newline at end of file
+
diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md
index 1da8a62456ee2c..9e85f2248e16fc 100644
--- a/docs/source/en/testing.md
+++ b/docs/source/en/testing.md
@@ -428,7 +428,7 @@ pytest --instafail
### To GPU or not to GPU
-On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`:
+On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""` for CUDA GPUs:
```bash
CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py
@@ -441,10 +441,12 @@ second gpu if you have gpus `0` and `1`, you can run:
CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
```
+For Intel GPUs, use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` in the above example.
+
This is handy when you want to run different tasks on different GPUs.
Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. The following skip
-decorators are used to set the requirements of tests CPU/GPU/TPU-wise:
+decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise:
- `require_torch` - this test will run only under torch
- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU
diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md
index f9ea3337699444..e3a66f42042485 100644
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@@ -174,7 +174,7 @@ trainer = Trainer(
processing_class=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
- callback=[EarlyStoppingCallback()],
+ callbacks=[EarlyStoppingCallback()],
)
```
@@ -252,7 +252,70 @@ trainer = Trainer(..., args=training_args)
NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior.
-## GaLore
+## Liger Kernel
+
+[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels developed by LinkedIn designed specifically for LLM training. We have implemented Hugging Face compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed.
+
+
+Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples)
+
+
+First make sure to install Liger official repository:
+```bash
+pip install liger-kernel
+```
+
+You should pass `use_liger_kernel=True` to apply the Liger kernel to your model, for example:
+
+```py
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+ output_dir="your-model",
+ learning_rate=2e-5,
+ per_device_train_batch_size=16,
+ per_device_eval_batch_size=16,
+ num_train_epochs=2,
+ weight_decay=0.01,
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ push_to_hub=True,
+ use_liger_kernel=True
+)
+```
+
+The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value.
+
+
+## Optimizers
+
+You can choose a built-in optimizer for training using:
+
+```python
+from transformers import TrainingArguments
+training_args = TrainingArguments(..., optim="adamw_torch")
+```
+
+See [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) for a full list of choices. We include advanced examples in the sections below.
+
+You can also use an arbitrary PyTorch optimizer via:
+
+```python
+import torch
+
+optimizer_cls = torch.optim.AdamW
+optimizer_kwargs = {
+ "lr": 4e-3,
+ "betas": (0.9, 0.999),
+ "weight_decay": 0.05,
+}
+
+from transformers import Trainer
+trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs))
+```
+
+### GaLore
Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods, such as LoRA.
@@ -382,42 +445,7 @@ trainer.train()
Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue.
-## Liger Kernel
-
-[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed.
-
-
-Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples)
-
-
-First make sure to install Liger official repository:
-```bash
-pip install liger-kernel
-```
-
-You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example:
-
-```py
-from transformers import TrainingArguments
-
-training_args = TrainingArguments(
- output_dir="your-model",
- learning_rate=2e-5,
- per_device_train_batch_size=16,
- per_device_eval_batch_size=16,
- num_train_epochs=2,
- weight_decay=0.01,
- eval_strategy="epoch",
- save_strategy="epoch",
- load_best_model_at_end=True,
- push_to_hub=True,
- use_liger_kernel=True
-)
-```
-
-The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value.
-
-## LOMO optimizer
+### LOMO optimizer
The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195).
They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`.
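+Before the full TRL-based script below, a minimal, non-authoritative sketch of selecting one of these optimizers through `TrainingArguments` could be (the output directory is a placeholder):
+
+```python
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="lomo-test",   # placeholder
+    optim="adalomo",          # or "lomo"
+)
+```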
@@ -467,7 +495,7 @@ trainer = trl.SFTTrainer(
trainer.train()
```
-## GrokAdamW optimizer
+### GrokAdamW optimizer
The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from grokking signal functions. To use GrokAdamW, first install the optimizer package with `pip install grokadamw`.
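+As a minimal sketch (the full fine-tuning example follows below), GrokAdamW is selected the same way as the other built-in optimizers; the output directory here is a placeholder:
+
+```python
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="grokadamw-test",  # placeholder
+    optim="grokadamw",
+)
+```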
@@ -518,7 +546,7 @@ trainer.train()
This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training.
-## Schedule Free Optimizer
+### Schedule Free Optimizer
The Schedule Free optimizers have been introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682).
Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, to completely remove the need to anneal the learning rate with a traditional schedule.
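+A minimal sketch of opting into a schedule-free optimizer, assuming `pip install schedulefree`; the output directory is a placeholder:
+
+```python
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="schedule-free-test",   # placeholder
+    optim="schedule_free_adamw",       # a schedule-free SGD variant is typically available as well
+)
+```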
diff --git a/docs/source/en/training.md b/docs/source/en/training.md
index aacf174fbd6be0..6fcf6809204b5a 100644
--- a/docs/source/en/training.md
+++ b/docs/source/en/training.md
@@ -287,9 +287,10 @@ model.fit(tf_dataset)
At this point, you may need to restart your notebook or execute the following code to free some memory:
```py
+from accelerate.utils.memory import clear_device_cache
del model
del trainer
-torch.cuda.empty_cache()
+clear_device_cache()
```
Next, manually postprocess `tokenized_dataset` to prepare it for training.
@@ -364,8 +365,9 @@ Lastly, specify `device` to use a GPU if you have access to one. Otherwise, trai
```py
>>> import torch
+>>> from accelerate.test_utils.testing import get_backend
->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model.to(device)
```
diff --git a/docs/source/es/model_sharing.md b/docs/source/es/model_sharing.md
index 43cf0b8eddb8f7..77ee523094f452 100644
--- a/docs/source/es/model_sharing.md
+++ b/docs/source/es/model_sharing.md
@@ -43,7 +43,7 @@ Como resultado, puedes cargar una versión específica del modelo con el paráme
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash
... )
```
diff --git a/docs/source/hi/_toctree.yml b/docs/source/hi/_toctree.yml
index 546a8663cc4d88..72759457a5c83a 100644
--- a/docs/source/hi/_toctree.yml
+++ b/docs/source/hi/_toctree.yml
@@ -1,3 +1,7 @@
- sections:
- local: pipeline_tutorial
- title: पाइपलाइनों के साथ अनुमान चलाएँ
\ No newline at end of file
+ title: पाइपलाइनों के साथ अनुमान चलाएँ
+ - local: accelerate
+ title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें
+ - local: tflite
+ title: TFLite में निर्यात करें
\ No newline at end of file
diff --git a/docs/source/hi/accelerate.md b/docs/source/hi/accelerate.md
new file mode 100644
index 00000000000000..3d568217a1299b
--- /dev/null
+++ b/docs/source/hi/accelerate.md
@@ -0,0 +1,136 @@
+
+
+# वितरित प्रशिक्षण के साथ 🤗 Accelerate
+
+जैसे-जैसे मॉडल बड़े होते हैं, समानांतरता सीमित हार्डवेयर पर बड़े मॉडल को प्रशिक्षित करने और प्रशिक्षण की गति को परिमाण के कई क्रमों (orders of magnitude) तक तेज करने के लिए एक रणनीति के रूप में उभरी है। हगिंग फेस में, हमने उपयोगकर्ताओं को किसी भी प्रकार के वितरित सेटअप पर 🤗 ट्रांसफार्मर्स मॉडल को आसानी से प्रशिक्षित करने में मदद करने के लिए [🤗 Accelerate](https://huggingface.co/docs/accelerate) पुस्तकालय बनाया है, चाहे वह एक मशीन पर कई GPU हों या कई मशीनों में कई GPU। इस ट्यूटोरियल में, जानें कि अपने मूल PyTorch प्रशिक्षण लूप को कैसे अनुकूलित किया जाए ताकि वितरित वातावरण में प्रशिक्षण सक्षम हो सके।
+
+## सेटअप
+
+🤗 Accelerate स्थापित करके शुरू करें:
+
+```bash
+pip install accelerate
+```
+
+फिर एक [`~accelerate.Accelerator`] ऑब्जेक्ट आयात करें और बनाएं। [`~accelerate.Accelerator`] स्वचालित रूप से आपके वितरित सेटअप के प्रकार का पता लगाएगा और प्रशिक्षण के लिए सभी आवश्यक घटकों को प्रारंभ करेगा। आपको अपने मॉडल को किसी डिवाइस पर स्पष्ट रूप से रखने की आवश्यकता नहीं है।
+
+```py
+>>> from accelerate import Accelerator
+
+>>> accelerator = Accelerator()
+```
+
+## तेजी लाने की तैयारी
+
+अगला कदम सभी प्रासंगिक प्रशिक्षण वस्तुओं को [`~accelerate.Accelerator.prepare`] विधि में पास करना है। इसमें आपके प्रशिक्षण और मूल्यांकन DataLoaders, एक मॉडल और एक ऑप्टिमाइज़र शामिल हैं:
+
+```py
+>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+... train_dataloader, eval_dataloader, model, optimizer
+... )
+```
+
+## बैकवर्ड
+
+अंतिम जोड़ यह है कि आपके प्रशिक्षण लूप में सामान्य `loss.backward()` को 🤗 Accelerate के [`~accelerate.Accelerator.backward`] विधि से बदलें:
+
+```py
+>>> for epoch in range(num_epochs):
+... for batch in train_dataloader:
+... outputs = model(**batch)
+... loss = outputs.loss
+... accelerator.backward(loss)
+
+... optimizer.step()
+... lr_scheduler.step()
+... optimizer.zero_grad()
+... progress_bar.update(1)
+```
+
+जैसा कि आप निम्नलिखित कोड में देख सकते हैं, आपको वितरित प्रशिक्षण सक्षम करने के लिए अपने प्रशिक्षण लूप में केवल चार अतिरिक्त कोड की पंक्तियाँ जोड़ने की आवश्यकता है!
+
+```diff
++ from accelerate import Accelerator
+ from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
+
++ accelerator = Accelerator()
+
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ optimizer = AdamW(model.parameters(), lr=3e-5)
+
+- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+- model.to(device)
+
++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
++ train_dataloader, eval_dataloader, model, optimizer
++ )
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(
+ "linear",
+ optimizer=optimizer,
+ num_warmup_steps=0,
+ num_training_steps=num_training_steps
+ )
+
+ progress_bar = tqdm(range(num_training_steps))
+
+ model.train()
+ for epoch in range(num_epochs):
+ for batch in train_dataloader:
+- batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+- loss.backward()
++ accelerator.backward(loss)
+
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+```
+
+## प्रशिक्षण
+
+एक बार जब आपने प्रासंगिक कोड की पंक्तियाँ जोड़ दी हैं, तो अपने प्रशिक्षण को स्क्रिप्ट या कोलैबोरेटरी जैसे नोटबुक में लॉन्च करें।
+
+### स्क्रिप्ट के साथ प्रशिक्षण
+
+यदि आप स्क्रिप्ट से अपना प्रशिक्षण चला रहे हैं, तो एक कॉन्फ़िगरेशन फ़ाइल बनाने और सहेजने के लिए निम्नलिखित कमांड चलाएँ:
+
+```bash
+accelerate config
+```
+
+फिर अपने प्रशिक्षण को इस तरह लॉन्च करें:
+
+```bash
+accelerate launch train.py
+```
+
+### नोटबुक के साथ प्रशिक्षण
+
+🤗 Accelerate एक नोटबुक में भी चल सकता है यदि आप Colaboratory के TPU का उपयोग करने की योजना बना रहे हैं। प्रशिक्षण के लिए जिम्मेदार सभी कोड को एक फ़ंक्शन में लपेटें, और इसे [`~accelerate.notebook_launcher`] में पास करें:
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> notebook_launcher(training_function)
+```
+
+🤗 Accelerate और इसकी समृद्ध सुविधाओं के बारे में अधिक जानकारी के लिए, [दस्तावेज़ीकरण](https://huggingface.co/docs/accelerate) देखें।
diff --git a/docs/source/hi/tflite.md b/docs/source/hi/tflite.md
new file mode 100644
index 00000000000000..5a84bed94266db
--- /dev/null
+++ b/docs/source/hi/tflite.md
@@ -0,0 +1,55 @@
+
+
+# TFLite में निर्यात करें
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/guide) एक हल्का ढांचा है जो मशीन लर्निंग मॉडल को संसाधन-सीमित उपकरणों, जैसे मोबाइल फोन, एम्बेडेड सिस्टम और इंटरनेट ऑफ थिंग्स (IoT) उपकरणों पर तैनात करने के लिए है। TFLite को इन उपकरणों पर सीमित गणनात्मक शक्ति, मेमोरी और ऊर्जा खपत के साथ मॉडल को कुशलता से ऑप्टिमाइज़ और चलाने के लिए डिज़ाइन किया गया है। एक TensorFlow Lite मॉडल को एक विशेष कुशल पोर्टेबल प्रारूप में दर्शाया जाता है जिसे `.tflite` फ़ाइल एक्सटेंशन द्वारा पहचाना जाता है।
+
+🤗 Optimum में `exporters.tflite` मॉड्यूल के माध्यम से 🤗 Transformers मॉडल को TFLite में निर्यात करने की कार्यक्षमता है। समर्थित मॉडल आर्किटेक्चर की सूची के लिए, कृपया [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/exporters/tflite/overview) देखें।
+
+TFLite में एक मॉडल निर्यात करने के लिए, आवश्यक निर्भरताएँ स्थापित करें:
+
+```bash
+pip install optimum[exporters-tf]
+```
+
+सभी उपलब्ध तर्कों की जांच करने के लिए, [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) देखें,
+या कमांड लाइन में मदद देखें:
+
+```bash
+optimum-cli export tflite --help
+```
+
+यदि आप 🤗 Hub से एक मॉडल का चेकपॉइंट निर्यात करना चाहते हैं, उदाहरण के लिए, `google-bert/bert-base-uncased`, निम्नलिखित कमांड चलाएँ:
+
+```bash
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
+```
+
+आपको प्रगति को दर्शाते हुए लॉग दिखाई देंगे और यह दिखाएंगे कि परिणामस्वरूप `model.tflite` कहाँ सहेजा गया है, जैसे:
+
+```bash
+Validating TFLite model...
+ -[✓] TFLite model output names match reference model (logits)
+ - Validating TFLite Model output "logits":
+ -[✓] (1, 128, 30522) matches (1, 128, 30522)
+ -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
+The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
+- logits: max diff = 5.817413330078125e-05.
+ The exported model was saved at: bert_tflite
+```
+
+उपरोक्त उदाहरण 🤗 Hub से एक चेकपॉइंट निर्यात करने को दर्शाता है। जब एक स्थानीय मॉडल निर्यात करते हैं, तो पहले सुनिश्चित करें कि आपने मॉडल के वज़न और टोकनाइज़र फ़ाइलों को एक ही निर्देशिका (`local_path`) में सहेजा है। CLI का उपयोग करते समय, चेकपॉइंट नाम के बजाय `model` तर्क में `local_path` पास करें।
diff --git a/docs/source/it/model_sharing.md b/docs/source/it/model_sharing.md
index 81257717ed9a70..6505658616baa5 100644
--- a/docs/source/it/model_sharing.md
+++ b/docs/source/it/model_sharing.md
@@ -43,7 +43,7 @@ Come risultato, puoi caricare una specifica versione di un modello con il parame
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # nome di un tag, di un branch, o commit hash
... )
```
diff --git a/docs/source/ja/main_classes/pipelines.md b/docs/source/ja/main_classes/pipelines.md
index bfb9922057d318..3980becebbde36 100644
--- a/docs/source/ja/main_classes/pipelines.md
+++ b/docs/source/ja/main_classes/pipelines.md
@@ -481,6 +481,12 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
- __call__
- all
+### ImageTextToTextPipeline
+
+[[autodoc]] ImageTextToTextPipeline
+ - __call__
+ - all
+
### VisualQuestionAnsweringPipeline
[[autodoc]] VisualQuestionAnsweringPipeline
diff --git a/docs/source/ja/model_sharing.md b/docs/source/ja/model_sharing.md
index aa8f7a3d1e3327..16d47057052b95 100644
--- a/docs/source/ja/model_sharing.md
+++ b/docs/source/ja/model_sharing.md
@@ -43,7 +43,7 @@ Model Hubの組み込みバージョニングはgitおよび[git-lfs](https://gi
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # タグ名、またはブランチ名、またはコミットハッシュ
+... "julien-c/EsperBERTo-small", revision="4c77982" # タグ名、またはブランチ名、またはコミットハッシュ
... )
```
diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml
index 27102f123dd871..e924ebe34d1c2a 100644
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@@ -206,8 +206,8 @@
title: 다중 CPU에서 훈련하기
- local: perf_train_tpu_tf
title: TensorFlow로 TPU에서 훈련하기
- - local: in_translation
- title: (번역중) PyTorch training on Apple silicon
+ - local: perf_train_special
+ title: Apple 실리콘에서 PyTorch 학습
- local: perf_hardware
title: 훈련용 사용자 맞춤형 하드웨어
- local: hpo_train
@@ -322,12 +322,12 @@
title: (번역중) ALBERT
- local: model_doc/bart
title: BART
- - local: in_translation
- title: (번역중) BARThez
+ - local: model_doc/barthez
+ title: BARThez
- local: model_doc/bartpho
title: BARTpho
- - local: in_translation
- title: (번역중) BERT
+ - local: model_doc/bert
+ title: BERT
- local: in_translation
title: (번역중) BertGeneration
- local: model_doc/bert-japanese
@@ -358,8 +358,8 @@
title: (번역중) CodeGen
- local: model_doc/cohere
title: Cohere
- - local: in_translation
- title: (번역중) ConvBERT
+ - local: model_doc/convbert
+ title: ConvBERT
- local: in_translation
title: (번역중) CPM
- local: in_translation
@@ -448,8 +448,8 @@
title: Mamba
- local: model_doc/mamba2
title: Mamba2
- - local: in_translation
- title: (번역중) MarianMT
+ - local: model_doc/marian
+ title: MarianMT
- local: in_translation
title: (번역중) MarkupLM
- local: in_translation
@@ -677,6 +677,10 @@
title: (번역중) 오디오 모델
- isExpanded: false
sections:
+ - local: model_doc/timesformer
+ title: TimeSformer
+ - local: in_translation
+ title: (번역중) VideoMAE
- local: model_doc/vivit
title: ViViT
title: (번역중) 비디오 모델
diff --git a/docs/source/ko/model_doc/barthez.md b/docs/source/ko/model_doc/barthez.md
new file mode 100644
index 00000000000000..131db38856cc1b
--- /dev/null
+++ b/docs/source/ko/model_doc/barthez.md
@@ -0,0 +1,60 @@
+
+
+# BARThez [[barthez]]
+
+## 개요 [[overview]]
+
+BARThez 모델은 2020년 10월 23일, Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis에 의해 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321)에서 제안되었습니다.
+
+이 논문의 초록:
+
+
+*자기지도 학습에 의해 가능해진 귀납적 전이 학습은 자연어 처리(NLP) 분야 전반에 걸쳐 큰 반향을 일으켰으며,
+BERT와 BART와 같은 모델들은 수많은 자연어 이해 작업에서 새로운 최첨단 성과를 기록했습니다. 일부 주목할 만한 예외가 있지만,
+대부분의 사용 가능한 모델과 연구는 영어에 집중되어 있었습니다. 본 연구에서는 BARThez를 소개합니다.
+이는 (우리가 아는 한) 프랑스어를 위한 첫 번째 BART 모델입니다.
+BARThez는 과거 연구에서 얻은 매우 큰 프랑스어 단일 언어 말뭉치로 사전훈련되었으며,
+BART의 변형 방식에 맞게 조정되었습니다.
+CamemBERT 및 FlauBERT와 같은 기존의 BERT 기반 프랑스어 모델과 달리, BARThez는 생성 작업에 특히 적합합니다.
+이는 인코더뿐만 아니라 디코더도 사전훈련되었기 때문입니다.
+우리는 FLUE 벤치마크에서의 판별 작업 외에도 이 논문과 함께 공개하는 새로운 요약 데이터셋인 OrangeSum에서 BARThez를 평가했습니다.
+또한 이미 사전훈련된 다국어 BART의 사전훈련을 BARThez의 말뭉치로 계속 진행하였으며,
+결과적으로 얻어진 모델인 mBARTHez가 기본 BARThez보다 유의미한 성능 향상을 보였고,
+CamemBERT 및 FlauBERT와 동등하거나 이를 능가함을 보였습니다.*
+
+이 모델은 [moussakam](https://huggingface.co/moussakam)이 기여했습니다. 저자의 코드는 [여기](https://github.com/moussaKam/BARThez)에서 찾을 수 있습니다.
+
+
+
+BARThez 구현은 🤗 BART와 동일하나, 토큰화에서 차이가 있습니다. 구성 클래스와 그 매개변수에 대한 정보는 [BART 문서](bart)를 참조하십시오.
+BARThez 전용 토크나이저는 아래에 문서화되어 있습니다.
+
+
+
+## 리소스 [[resources]]
+
+- BARThez는 🤗 BART와 유사한 방식으로 시퀀스-투-시퀀스 작업에 맞춰 미세 조정될 수 있습니다. 다음을 확인하세요:
+ [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).
+
+
+## BarthezTokenizer [[bartheztokenizer]]
+
+[[autodoc]] BarthezTokenizer
+
+## BarthezTokenizerFast [[bartheztokenizerfast]]
+
+[[autodoc]] BarthezTokenizerFast
diff --git a/docs/source/ko/model_doc/bert.md b/docs/source/ko/model_doc/bert.md
new file mode 100644
index 00000000000000..531d3e3dd63947
--- /dev/null
+++ b/docs/source/ko/model_doc/bert.md
@@ -0,0 +1,340 @@
+
+
+# BERT[[BERT]]
+
+
+
+## 개요[[Overview]]
+
+BERT 모델은 Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova가 제안한 논문 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)에서 소개되었습니다. BERT는 사전 학습된 양방향 트랜스포머로, Toronto Book Corpus와 Wikipedia로 구성된 대규모 코퍼스에서 마스킹된 언어 모델링과 다음 문장 예측(Next Sentence Prediction) 목표를 결합해 학습되었습니다.
+
+해당 논문의 초록입니다:
+
+*우리는 BERT(Bidirectional Encoder Representations from Transformers)라는 새로운 언어 표현 모델을 소개합니다. 최근의 다른 언어 표현 모델들과 달리, BERT는 모든 계층에서 양방향으로 양쪽 문맥을 조건으로 사용하여 비지도 학습된 텍스트에서 깊이 있는 양방향 표현을 사전 학습하도록 설계되었습니다. 그 결과, 사전 학습된 BERT 모델은 추가적인 출력 계층 하나만으로 질문 응답, 언어 추론과 같은 다양한 작업에서 미세 조정될 수 있으므로, 특정 작업을 위해 아키텍처를 수정할 필요가 없습니다.*
+
+*BERT는 개념적으로 단순하면서도 실증적으로 강력한 모델입니다. BERT는 11개의 자연어 처리 과제에서 새로운 최고 성능을 달성했으며, GLUE 점수를 80.5% (7.7% 포인트 절대 개선)로, MultiNLI 정확도를 86.7% (4.6% 포인트 절대 개선), SQuAD v1.1 질문 응답 테스트에서 F1 점수를 93.2 (1.5% 포인트 절대 개선)로, SQuAD v2.0에서 F1 점수를 83.1 (5.1% 포인트 절대 개선)로 향상시켰습니다.*
+
+이 모델은 [thomwolf](https://huggingface.co/thomwolf)가 기여하였습니다. 원본 코드는 [여기](https://github.com/google-research/bert)에서 확인할 수 있습니다.
+
+## 사용 팁[[Usage tips]]
+
+- BERT는 절대 위치 임베딩을 사용하는 모델이므로 입력을 왼쪽이 아니라 오른쪽에서 패딩하는 것이 일반적으로 권장됩니다.
+- BERT는 마스킹된 언어 모델(MLM)과 Next Sentence Prediction(NSP) 목표로 학습되었습니다. 이는 마스킹된 토큰 예측과 전반적인 자연어 이해(NLU)에 뛰어나지만, 텍스트 생성에는 최적화되어있지 않습니다.
+- BERT의 사전 학습 과정에서는 입력 토큰의 일부가 무작위로 마스킹됩니다. 전체 토큰 중 약 15%가 다음과 같은 방식으로 처리됩니다(아래의 간단한 예시도 참고하세요):
+
+ * 80% 확률로 마스크 토큰으로 대체
+ * 10% 확률로 임의의 다른 토큰으로 대체
+ * 10% 확률로 원래 토큰 그대로 유지
+
+- 모델의 주요 목표는 원본 문장을 예측하는 것이지만, 두 번째 목표도 있습니다. 입력으로 문장 A와 B(사이에 구분 토큰이 있음)가 주어지는데, 이 문장 쌍이 실제로 연속된 문장일 확률은 50%이고 나머지 50%는 서로 무관한 문장입니다. 모델은 두 문장이 연속된 문장인지 아닌지를 예측해야 합니다.
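+
+아래는 이러한 15% 마스킹 방식이 실제로 어떻게 적용되는지 보여주는 간단한 예시 코드입니다. 예시 문장과 체크포인트 이름은 설명을 위한 것입니다.
+
+```python
+from transformers import AutoTokenizer, DataCollatorForLanguageModeling
+
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+# BERT 사전 학습과 동일하게 토큰의 15%를 마스킹 대상으로 선택합니다
+collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)
+
+batch = collator([tokenizer("Hello, how are you?")])
+print(batch["input_ids"])  # 일부 토큰이 [MASK] 토큰 ID로 대체됩니다
+print(batch["labels"])     # 마스킹되지 않은 위치의 레이블은 -100 입니다
+```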
+
+### Scaled Dot Product Attention(SDPA) 사용하기 [[Using Scaled Dot Product Attention (SDPA)]]
+
+Pytorch는 `torch.nn.functional`의 일부로 Scaled Dot Product Attention(SDPA) 연산자를 기본적으로 제공합니다. 이 함수는 입력과 하드웨어에 따라 여러 구현 방식을 사용할 수 있습니다. 자세한 내용은 [공식 문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)나 [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)에서 확인할 수 있습니다.
+
+`torch>=2.1.1`에서는 구현이 가능한 경우 SDPA가 기본적으로 사용되지만, `from_pretrained()`함수에서 `attn_implementation="sdpa"`를 설정하여 SDPA를 명시적으로 사용하도록 지정할 수도 있습니다.
+
+```python
+import torch
+from transformers import BertModel
+
+model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")
+...
+```
+
+최적 성능 향상을 위해 모델을 반정밀도(예: `torch.float16` 또는 `torch.bfloat16`)로 불러오는 것을 권장합니다.
+
+로컬 벤치마크 (A100-80GB, CPUx12, RAM 96.6GB, PyTorch 2.2.0, OS Ubuntu 22.04)에서 `float16`을 사용해 학습 및 추론을 수행한 결과, 다음과 같은 속도 향상이 관찰되었습니다.
+
+#### 학습 [[Training]]
+
+|batch_size|seq_len|Time per batch (eager - s)|Time per batch (sdpa - s)|Speedup (%)|Eager peak mem (MB)|sdpa peak mem (MB)|Mem saving (%)|
+|----------|-------|--------------------------|-------------------------|-----------|-------------------|------------------|--------------|
+|4 |256 |0.023 |0.017 |35.472 |939.213 |764.834 |22.800 |
+|4 |512 |0.023 |0.018 |23.687 |1970.447 |1227.162 |60.569 |
+|8 |256 |0.023 |0.018 |23.491 |1594.295 |1226.114 |30.028 |
+|8 |512 |0.035 |0.025 |43.058 |3629.401 |2134.262 |70.054 |
+|16 |256 |0.030 |0.024 |25.583 |2874.426 |2134.262 |34.680 |
+|16 |512 |0.064 |0.044 |46.223 |6964.659 |3961.013 |75.830 |
+
+#### 추론 [[Inference]]
+
+|batch_size|seq_len|Per token latency eager (ms)|Per token latency SDPA (ms)|Speedup (%)|Mem eager (MB)|Mem BT (MB)|Mem saved (%)|
+|----------|-------|----------------------------|---------------------------|-----------|--------------|-----------|-------------|
+|1 |128 |5.736 |4.987 |15.022 |282.661 |282.924 |-0.093 |
+|1 |256 |5.689 |4.945 |15.055 |298.686 |298.948 |-0.088 |
+|2 |128 |6.154 |4.982 |23.521 |314.523 |314.785 |-0.083 |
+|2 |256 |6.201 |4.949 |25.303 |347.546 |347.033 |0.148 |
+|4 |128 |6.049 |4.987 |21.305 |378.895 |379.301 |-0.107 |
+|4 |256 |6.285 |5.364 |17.166 |443.209 |444.382 |-0.264 |
+
+
+
+## 자료[[Resources]]
+
+BERT를 시작하는 데 도움이 되는 Hugging Face와 커뮤니티 자료 목록(🌎로 표시됨)입니다. 여기에 포함될 자료를 제출하고 싶다면 PR(Pull Request)을 열어주세요. 리뷰해 드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다.
+
+
+
+- [BERT 텍스트 분류 (다른 언어로)](https://www.philschmid.de/bert-text-classification-in-a-different-language)에 대한 블로그 포스트.
+- [다중 레이블 텍스트 분류를 위한 BERT (및 관련 모델) 미세 조정](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb)에 대한 노트북.
+- [PyTorch를 이용해 BERT를 다중 레이블 분류를 위해 미세 조정하는 방법](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)에 대한 노트북. 🌎
+- [BERT로 EncoderDecoder 모델을 warm-start하여 요약하기](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)에 대한 노트북.
+- [`BertForSequenceClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)에서 지원됩니다.
+- [`TFBertForSequenceClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)에서 지원됩니다.
+- [`FlaxBertForSequenceClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb)에서 지원됩니다.
+- [텍스트 분류 작업 가이드](../tasks/sequence_classification)
+
+
+
+- [Keras와 함께 Hugging Face Transformers를 사용하여 비영어권 BERT를 개체명 인식(NER)용으로 미세 조정하는 방법](https://www.philschmid.de/huggingface-transformers-keras-tf)에 대한 블로그 포스트.
+- [BERT를 개체명 인식을 위해 미세 조정하기](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb)에 대한 노트북. 각 단어의 첫 번째 wordpiece에만 레이블을 지정하여 학습하는 방법을 설명합니다. 모든 wordpiece에 레이블을 전파하는 방법은 [이 버전](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb)에서 확인할 수 있습니다.
+- [`BertForTokenClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)에서 지원됩니다.
+- [`TFBertForTokenClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)에서 지원됩니다.
+- [`FlaxBertForTokenClassification`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification)에서 지원됩니다.
+- 🤗 Hugging Face 코스의 [토큰 분류 챕터](https://huggingface.co/course/chapter7/2?fw=pt).
+- [토큰 분류 작업 가이드](../tasks/token_classification)
+
+
+
+- [`BertForMaskedLM`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)에서 지원됩니다.
+- [`TFBertForMaskedLM`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) 와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)에서 지원됩니다.
+- [`FlaxBertForMaskedLM`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb)에서 지원됩니다.
+- 🤗 Hugging Face 코스의 [마스킹된 언어 모델링 챕터](https://huggingface.co/course/chapter7/3?fw=pt).
+- [마스킹된 언어 모델링 작업 가이드](../tasks/masked_language_modeling)
+
+
+
+- [`BertForQuestionAnswering`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)에서 지원됩니다.
+- [`TFBertForQuestionAnswering`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) 와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)에서 지원됩니다.
+- [`FlaxBertForQuestionAnswering`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering)에서 지원됩니다.
+- 🤗 Hugging Face 코스의 [질문 답변 챕터](https://huggingface.co/course/chapter7/7?fw=pt).
+- [질문 답변 작업 가이드](../tasks/question_answering)
+
+**다중 선택**
+- [`BertForMultipleChoice`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)에서 지원됩니다.
+- [`TFBertForMultipleChoice`]이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)에서 지원됩니다.
+- [다중 선택 작업 가이드](../tasks/multiple_choice)
+
+⚡️ **추론**
+- [Hugging Face Transformers와 AWS Inferentia를 사용하여 BERT 추론을 가속화하는 방법](https://huggingface.co/blog/bert-inferentia-sagemaker)에 대한 블로그 포스트.
+- [GPU에서 DeepSpeed-Inference로 BERT 추론을 가속화하는 방법](https://www.philschmid.de/bert-deepspeed-inference)에 대한 블로그 포스트.
+
+⚙️ **사전 학습**
+- [Hugging Face Transformers와 Habana Gaudi로 BERT를 사전 학습하는 방법](https://www.philschmid.de/pre-training-bert-habana)에 대한 블로그 포스트.
+
+🚀 **배포**
+- [Hugging Face Optimum으로 Transformers를 ONNX로 변환하는 방법](https://www.philschmid.de/convert-transformers-to-onnx)에 대한 블로그 포스트.
+- [AWS에서 Hugging Face Transformers를 위한 Habana Gaudi 딥러닝 환경 설정 방법](https://www.philschmid.de/getting-started-habana-gaudi#conclusion)에 대한 블로그 포스트.
+- [Hugging Face Transformers, Amazon SageMaker 및 Terraform 모듈을 이용한 BERT 자동 확장](https://www.philschmid.de/terraform-huggingface-amazon-sagemaker-advanced)에 대한 블로그 포스트.
+- [Hugging Face, AWS Lambda, Docker를 활용하여 서버리스 BERT 설정하는 방법](https://www.philschmid.de/serverless-bert-with-huggingface-aws-lambda-docker)에 대한 블로그 포스트.
+- [Amazon SageMaker와 Training Compiler를 사용하여 Hugging Face Transformers에서 BERT 미세 조정하는 방법](https://www.philschmid.de/huggingface-amazon-sagemaker-training-compiler)에 대한 블로그.
+- [Amazon SageMaker를 사용한 Transformers와 BERT의 작업별 지식 증류](https://www.philschmid.de/knowledge-distillation-bert-transformers)에 대한 블로그 포스트.
+
+## BertConfig
+
+[[autodoc]] BertConfig
+ - all
+
+## BertTokenizer
+
+[[autodoc]] BertTokenizer
+ - build_inputs_with_special_tokens
+ - get_special_tokens_mask
+ - create_token_type_ids_from_sequences
+ - save_vocabulary
+
+
+
+
+## BertTokenizerFast
+
+[[autodoc]] BertTokenizerFast
+
+
+
+
+## TFBertTokenizer
+
+[[autodoc]] TFBertTokenizer
+
+
+
+
+## Bert specific outputs
+
+[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput
+
+[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
+
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+
+
+
+
+
+## BertModel
+
+[[autodoc]] BertModel
+ - forward
+
+## BertForPreTraining
+
+[[autodoc]] BertForPreTraining
+ - forward
+
+## BertLMHeadModel
+
+[[autodoc]] BertLMHeadModel
+ - forward
+
+## BertForMaskedLM
+
+[[autodoc]] BertForMaskedLM
+ - forward
+
+## BertForNextSentencePrediction
+
+[[autodoc]] BertForNextSentencePrediction
+ - forward
+
+## BertForSequenceClassification
+
+[[autodoc]] BertForSequenceClassification
+ - forward
+
+## BertForMultipleChoice
+
+[[autodoc]] BertForMultipleChoice
+ - forward
+
+## BertForTokenClassification
+
+[[autodoc]] BertForTokenClassification
+ - forward
+
+## BertForQuestionAnswering
+
+[[autodoc]] BertForQuestionAnswering
+ - forward
+
+
+
+
+## TFBertModel
+
+[[autodoc]] TFBertModel
+ - call
+
+## TFBertForPreTraining
+
+[[autodoc]] TFBertForPreTraining
+ - call
+
+## TFBertLMHeadModel
+
+[[autodoc]] TFBertLMHeadModel
+ - call
+
+## TFBertForMaskedLM
+
+[[autodoc]] TFBertForMaskedLM
+ - call
+
+## TFBertForNextSentencePrediction
+
+[[autodoc]] TFBertForNextSentencePrediction
+ - call
+
+## TFBertForSequenceClassification
+
+[[autodoc]] TFBertForSequenceClassification
+ - call
+
+## TFBertForMultipleChoice
+
+[[autodoc]] TFBertForMultipleChoice
+ - call
+
+## TFBertForTokenClassification
+
+[[autodoc]] TFBertForTokenClassification
+ - call
+
+## TFBertForQuestionAnswering
+
+[[autodoc]] TFBertForQuestionAnswering
+ - call
+
+
+
+
+## FlaxBertModel
+
+[[autodoc]] FlaxBertModel
+ - __call__
+
+## FlaxBertForPreTraining
+
+[[autodoc]] FlaxBertForPreTraining
+ - __call__
+
+## FlaxBertForCausalLM
+
+[[autodoc]] FlaxBertForCausalLM
+ - __call__
+
+## FlaxBertForMaskedLM
+
+[[autodoc]] FlaxBertForMaskedLM
+ - __call__
+
+## FlaxBertForNextSentencePrediction
+
+[[autodoc]] FlaxBertForNextSentencePrediction
+ - __call__
+
+## FlaxBertForSequenceClassification
+
+[[autodoc]] FlaxBertForSequenceClassification
+ - __call__
+
+## FlaxBertForMultipleChoice
+
+[[autodoc]] FlaxBertForMultipleChoice
+ - __call__
+
+## FlaxBertForTokenClassification
+
+[[autodoc]] FlaxBertForTokenClassification
+ - __call__
+
+## FlaxBertForQuestionAnswering
+
+[[autodoc]] FlaxBertForQuestionAnswering
+ - __call__
+
+
+
+
+
diff --git a/docs/source/ko/model_doc/convbert.md b/docs/source/ko/model_doc/convbert.md
new file mode 100644
index 00000000000000..ec64a369b56a3f
--- /dev/null
+++ b/docs/source/ko/model_doc/convbert.md
@@ -0,0 +1,135 @@
+
+
+# ConvBERT [[convbert]]
+
+
+
+## 개요 [[overview]]
+
+ConvBERT 모델은 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan에 의해 제안되었으며, 제안 논문 제목은 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496)입니다.
+
+논문의 초록은 다음과 같습니다:
+
+*BERT와 그 변형 모델과 같은 사전 학습된 언어 모델들은 최근 다양한 자연어 이해 과제에서 놀라운 성과를 이루었습니다. 그러나 BERT는 글로벌 셀프 어텐션 블록에 크게 의존하기 때문에 메모리 사용량이 많고 계산 비용이 큽니다. 모든 어텐션 헤드가 글로벌 관점에서 어텐션 맵을 생성하기 위해 입력 시퀀스 전체를 탐색하지만, 일부 헤드는 로컬 종속성만 학습할 필요가 있다는 것을 발견했습니다. 이는 불필요한 계산이 포함되어 있음을 의미합니다. 따라서 우리는 이러한 self-attention 헤드들을 대체하여 로컬 종속성을 직접 모델링하기 위해 새로운 span 기반 동적 컨볼루션을 제안합니다. 새로운 컨볼루션 헤드와 나머지 self-attention 헤드들이 결합하여 글로벌 및 로컬 문맥 학습에 더 효율적인 혼합 어텐션 블록을 구성합니다. 우리는 BERT에 이 혼합 어텐션 설계를 적용하여 ConvBERT 모델을 구축했습니다. 실험 결과, ConvBERT는 다양한 다운스트림 과제에서 BERT 및 그 변형 모델보다 더 우수한 성능을 보였으며, 훈련 비용과 모델 파라미터 수가 더 적었습니다. 특히 ConvBERTbase 모델은 GLUE 스코어 86.4를 달성하여 ELECTRAbase보다 0.7 높은 성과를 보이며, 훈련 비용은 1/4 이하로 줄었습니다. 코드와 사전 학습된 모델은 공개될 예정입니다.*
+
+이 모델은 [abhishek](https://huggingface.co/abhishek)에 의해 기여되었으며, 원본 구현은 여기에서 찾을 수 있습니다: https://github.com/yitu-opensource/ConvBert
+
+
+
+## 사용 팁 [[usage-tips]]
+ConvBERT 훈련 팁은 BERT와 유사합니다. 사용 팁은 [BERT 문서](bert)를 참고하십시오.
+
+
+## 리소스 [[resources]]
+
+- [텍스트 분류 작업 가이드 (Text classification task guide)](../tasks/sequence_classification)
+- [토큰 분류 작업 가이드 (Token classification task guide)](../tasks/token_classification)
+- [질의응답 작업 가이드 (Question answering task guide)](../tasks/question_answering)
+- [마스킹된 언어 모델링 작업 가이드 (Masked language modeling task guide)](../tasks/masked_language_modeling)
+- [다중 선택 작업 가이드 (Multiple choice task guide)](../tasks/multiple_choice)
+
+## ConvBertConfig [[transformers.ConvBertConfig]]
+
+[[autodoc]] ConvBertConfig
+
+## ConvBertTokenizer [[transformers.ConvBertTokenizer]]
+
+[[autodoc]] ConvBertTokenizer
+ - build_inputs_with_special_tokens
+ - get_special_tokens_mask
+ - create_token_type_ids_from_sequences
+ - save_vocabulary
+
+## ConvBertTokenizerFast [[transformers.ConvBertTokenizerFast]]
+
+[[autodoc]] ConvBertTokenizerFast
+
+
+
+
+## ConvBertModel [[transformers.ConvBertModel]]
+
+[[autodoc]] ConvBertModel
+ - forward
+
+## ConvBertForMaskedLM [[transformers.ConvBertForMaskedLM]]
+
+[[autodoc]] ConvBertForMaskedLM
+ - forward
+
+## ConvBertForSequenceClassification [[transformers.ConvBertForSequenceClassification]]
+
+[[autodoc]] ConvBertForSequenceClassification
+ - forward
+
+## ConvBertForMultipleChoice [[transformers.ConvBertForMultipleChoice]]
+
+[[autodoc]] ConvBertForMultipleChoice
+ - forward
+
+## ConvBertForTokenClassification [[transformers.ConvBertForTokenClassification]]
+
+[[autodoc]] ConvBertForTokenClassification
+ - forward
+
+## ConvBertForQuestionAnswering [[transformers.ConvBertForQuestionAnswering]]
+
+[[autodoc]] ConvBertForQuestionAnswering
+ - forward
+
+
+
+
+## TFConvBertModel [[transformers.TFConvBertModel]]
+
+[[autodoc]] TFConvBertModel
+ - call
+
+## TFConvBertForMaskedLM [[transformers.TFConvBertForMaskedLM]]
+
+[[autodoc]] TFConvBertForMaskedLM
+ - call
+
+## TFConvBertForSequenceClassification [[transformers.TFConvBertForSequenceClassification]]
+
+[[autodoc]] TFConvBertForSequenceClassification
+ - call
+
+## TFConvBertForMultipleChoice [[transformers.TFConvBertForMultipleChoice]]
+
+[[autodoc]] TFConvBertForMultipleChoice
+ - call
+
+## TFConvBertForTokenClassification [[transformers.TFConvBertForTokenClassification]]
+
+[[autodoc]] TFConvBertForTokenClassification
+ - call
+
+## TFConvBertForQuestionAnswering [[transformers.TFConvBertForQuestionAnswering]]
+
+[[autodoc]] TFConvBertForQuestionAnswering
+ - call
+
+
+
diff --git a/docs/source/ko/model_doc/marian.md b/docs/source/ko/model_doc/marian.md
new file mode 100644
index 00000000000000..79a9641401d01e
--- /dev/null
+++ b/docs/source/ko/model_doc/marian.md
@@ -0,0 +1,217 @@
+
+
+# MarianMT[[MarianMT]]
+
+
+
+## 개요[[Overview]]
+
+BART와 동일한 모델을 사용하는 번역 모델 프레임워크입니다. 번역 결과는 각 모델 카드의 테스트 세트와 유사하지만, 정확히 일치하지는 않을 수 있습니다. 이 모델은 [sshleifer](https://huggingface.co/sshleifer)가 제공했습니다.
+
+
+## 구현 노트[[Implementation Notes]]
+
+- 각 모델은 약 298 MB를 차지하며, 1,000개 이상의 모델이 제공됩니다.
+- 지원되는 언어 쌍 목록은 [여기](https://huggingface.co/Helsinki-NLP)에서 확인할 수 있습니다.
+- 모델들은 [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann)에 의해 [Marian](https://marian-nmt.github.io/) C++ 라이브러리를 이용하여 학습되었습니다. 이 라이브러리는 빠른 학습과 번역을 지원합니다.
+- 모든 모델은 6개 레이어로 이루어진 Transformer 기반의 인코더-디코더 구조입니다. 각 모델의 성능은 모델 카드에 기입되어 있습니다.
+- BPE 전처리가 필요한 80개의 OPUS 모델은 지원되지 않습니다.
+- 모델링 코드는 [`BartForConditionalGeneration`]을 기반으로 하며, 일부 수정사항이 반영되어 있습니다:
+
+ - 정적 (사인 함수 기반) 위치 임베딩 사용 (`MarianConfig.static_position_embeddings=True`)
+ - 임베딩 레이어 정규화 생략 (`MarianConfig.normalize_embedding=False`)
+  - 모델은 생성 시 프리픽스로 `pad_token_id`(해당 토큰의 임베딩 값은 0)를 사용하여 시작합니다 (Bart는
+    `<s/>`를 사용).
+- Marian 모델을 PyTorch로 대량 변환하는 코드는 `convert_marian_to_pytorch.py`에서 찾을 수 있습니다.
+
+
+## 모델 이름 규칙[[Naming]]
+
+- 모든 모델 이름은 `Helsinki-NLP/opus-mt-{src}-{tgt}` 형식을 따릅니다.
+- 모델의 언어 코드 표기는 일관되지 않습니다. 두 자리 코드는 일반적으로 [여기](https://developers.google.com/admin-sdk/directory/v1/languages)에서 찾을 수 있으며, 세 자리 코드는 "언어 코드 {code}"로 구글 검색을 통해 찾습니다.
+- `es_AR`과 같은 형태의 코드는 `code_{region}` 형식을 의미합니다. 여기서의 예시는 아르헨티나의 스페인어를 의미합니다.
+- 모델 변환은 두 단계로 이루어졌습니다. 처음 1,000개 모델은 ISO-639-2 코드를 사용하고, 두 번째 그룹은 ISO-639-5와 ISO-639-2 코드를 조합하여 언어를 식별합니다.
+
+
+## 예시[[Examples]]
+
+- Marian 모델은 라이브러리의 다른 번역 모델들보다 크기가 작아 파인튜닝 실험과 통합 테스트에 유용합니다.
+- [GPU에서 파인튜닝하기](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh)
+
+## 다국어 모델 사용법[[Multilingual Models]]
+
+- 모든 모델 이름은 `Helsinki-NLP/opus-mt-{src}-{tgt}` 형식을 따릅니다.
+- 다중 언어 출력을 지원하는 모델의 경우, 출력을 원하는 언어의 언어 코드를 `src_text`의 시작 부분에 추가하여 지정해야 합니다.
+- 모델 카드에서 지원되는 언어 코드의 목록을 확인할 수 있습니다! 예를 들어 [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa)에서 확인할 수 있습니다.
+- `Helsinki-NLP/opus-mt-roa-en`처럼 소스 측에서만 다국어를 지원하는 모델의 경우, 별도의 언어 코드 지정이 필요하지 않습니다.
+
+[Tatoeba-Challenge 리포지토리](https://github.com/Helsinki-NLP/Tatoeba-Challenge)의 새로운 다국어 모델은 3자리 언어 코드를 사용합니다:
+
+
+```python
+>>> from transformers import MarianMTModel, MarianTokenizer
+
+>>> src_text = [
+... ">>fra<< this is a sentence in english that we want to translate to french",
+... ">>por<< This should go to portuguese",
+... ">>esp<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
+>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+>>> print(tokenizer.supported_language_codes)
+['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
+
+>>> model = MarianMTModel.from_pretrained(model_name)
+>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+>>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+["c'est une phrase en anglais que nous voulons traduire en français",
+ 'Isto deve ir para o português.',
+ 'Y esto al español']
+```
+
+허브에 있는 모든 사전 학습된 모델을 확인하는 코드입니다:
+
+```python
+from huggingface_hub import list_models
+
+model_list = list_models()
+org = "Helsinki-NLP"
+model_ids = [x.id for x in model_list if x.id.startswith(org)]
+suffix = [x.split("/")[1] for x in model_ids]
+old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
+```
+
+## 구형 다국어 모델[[Old Style Multi-Lingual Models]]
+
+이 모델들은 OPUS-MT-Train 리포지토리의 구형 다국어 모델들입니다. 각 언어 그룹에 포함된 언어들은 다음과 같습니다:
+
+```python no-style
+['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
+ 'Helsinki-NLP/opus-mt-ROMANCE-en',
+ 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
+ 'Helsinki-NLP/opus-mt-de-ZH',
+ 'Helsinki-NLP/opus-mt-en-CELTIC',
+ 'Helsinki-NLP/opus-mt-en-ROMANCE',
+ 'Helsinki-NLP/opus-mt-es-NORWAY',
+ 'Helsinki-NLP/opus-mt-fi-NORWAY',
+ 'Helsinki-NLP/opus-mt-fi-ZH',
+ 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
+ 'Helsinki-NLP/opus-mt-sv-NORWAY',
+ 'Helsinki-NLP/opus-mt-sv-ZH']
+GROUP_MEMBERS = {
+ 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+ 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+ 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+ 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+ 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+ 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+ 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+}
+```
+
+영어를 여러 로망스 언어로 번역하는 예제입니다. 여기서는 구형 2자리 언어 코드를 사용합니다:
+
+
+```python
+>>> from transformers import MarianMTModel, MarianTokenizer
+
+>>> src_text = [
+... ">>fr<< this is a sentence in english that we want to translate to french",
+... ">>pt<< This should go to portuguese",
+... ">>es<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
+>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+
+>>> model = MarianMTModel.from_pretrained(model_name)
+>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+>>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+["c'est une phrase en anglais que nous voulons traduire en français",
+ 'Isto deve ir para o português.',
+ 'Y esto al español']
+```
+
+## 자료[[Resources]]
+
+- [번역 작업 가이드](../tasks/translation)
+- [요약 작업 가이드](../tasks/summarization)
+- [언어 모델링 작업 가이드](../tasks/language_modeling)
+
+## MarianConfig
+
+[[autodoc]] MarianConfig
+
+## MarianTokenizer
+
+[[autodoc]] MarianTokenizer
+ - build_inputs_with_special_tokens
+
+
+
+
+## MarianModel
+
+[[autodoc]] MarianModel
+ - forward
+
+## MarianMTModel
+
+[[autodoc]] MarianMTModel
+ - forward
+
+## MarianForCausalLM
+
+[[autodoc]] MarianForCausalLM
+ - forward
+
+
+
+
+## TFMarianModel
+
+[[autodoc]] TFMarianModel
+ - call
+
+## TFMarianMTModel
+
+[[autodoc]] TFMarianMTModel
+ - call
+
+
+
+
+## FlaxMarianModel
+
+[[autodoc]] FlaxMarianModel
+ - __call__
+
+## FlaxMarianMTModel
+
+[[autodoc]] FlaxMarianMTModel
+ - __call__
+
+
+
diff --git a/docs/source/ko/model_doc/timesformer.md b/docs/source/ko/model_doc/timesformer.md
new file mode 100644
index 00000000000000..aa75cee447a47f
--- /dev/null
+++ b/docs/source/ko/model_doc/timesformer.md
@@ -0,0 +1,51 @@
+
+
+# TimeSformer [[timesformer]]
+
+## 개요 [[overview]]
+
+TimeSformer 모델은 Facebook Research에서 제안한 [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095)에서 소개되었습니다. 이 연구는 첫 번째 비디오 Transformer로서, 행동 인식 분야에서 중요한 이정표가 되었습니다. 또한 Transformer 기반의 비디오 이해 및 분류 논문에 많은 영감을 주었습니다.
+
+논문의 초록은 다음과 같습니다.
+
+*우리는 공간과 시간에 걸쳐 셀프 어텐션만을 사용하는 합성곱이 없는(convolution-free) 비디오 분류 방법을 제안합니다. 이 방법은 “TimeSformer”라고 불리며, 표준 Transformer 아키텍처를 비디오에 적용하여 프레임 수준 패치 시퀀스로부터 직접 시공간적 특징을 학습할 수 있게 합니다. 우리의 실험적 연구는 다양한 셀프 어텐션 방식을 비교하며, 시간적 어텐션과 공간적 어텐션을 각각의 블록 내에서 별도로 적용하는 “분할 어텐션” 방식이 고려된 설계 선택 중 가장 우수한 비디오 분류 정확도를 제공한다는 것을 시사합니다. 이 혁신적인 설계에도 불구하고, TimeSformer는 Kinetics-400 및 Kinetics-600을 포함한 여러 행동 인식 벤치마크에서 최첨단 결과를 달성했으며, 현재까지 보고된 가장 높은 정확도를 기록했습니다. 마지막으로, 3D 합성곱 네트워크와 비교했을 때, TimeSformer는 더 빠르게 학습할 수 있으며, 약간의 정확도 저하를 감수하면 테스트 효율성이 크게 향상되고, 1분 이상의 긴 비디오 클립에도 적용할 수 있습니다. 코드와 모델은 다음 링크에서 확인할 수 있습니다: [https URL 링크](https://github.com/facebookresearch/TimeSformer).*
+
+이 모델은 [fcakyon](https://huggingface.co/fcakyon)이 기여하였습니다.
+원본 코드는 [여기](https://github.com/facebookresearch/TimeSformer)에서 확인할 수 있습니다.
+
+## 사용 팁 [[usage-tips]]
+
+다양한 사전 학습된 모델의 변형들이 있습니다. 사용하려는 데이터셋에 맞춰 사전 학습된 모델을 선택해야 합니다. 또한, 모델 크기에 따라 클립당 입력 프레임 수가 달라지므로, 사전 학습된 모델을 선택할 때 이 매개변수를 고려해야 합니다.
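+
+예를 들어, 아래와 같이 체크포인트의 설정을 확인하면 해당 모델이 기대하는 클립당 입력 프레임 수를 알 수 있습니다. 아래 체크포인트 이름은 예시이며, 출력 값은 선택한 체크포인트에 따라 달라질 수 있습니다.
+
+```python
+from transformers import AutoConfig
+
+# 체크포인트가 기대하는 클립당 입력 프레임 수를 확인합니다 (체크포인트 이름은 예시입니다)
+config = AutoConfig.from_pretrained("facebook/timesformer-base-finetuned-k400")
+print(config.num_frames)
+```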
+
+
+## 리소스 [[resources]]
+
+- [Video classification task guide](../tasks/video_classification)
+
+## TimesformerConfig [[transformers.TimesformerConfig]]
+
+[[autodoc]] TimesformerConfig
+
+## TimesformerModel [[transformers.TimesformerModel]]
+
+[[autodoc]] TimesformerModel
+ - forward
+
+## TimesformerForVideoClassification [[transformers.TimesformerForVideoClassification]]
+
+[[autodoc]] TimesformerForVideoClassification
+ - forward
\ No newline at end of file
diff --git a/docs/source/ko/model_sharing.md b/docs/source/ko/model_sharing.md
index 868cc3b231de93..38115077966275 100644
--- a/docs/source/ko/model_sharing.md
+++ b/docs/source/ko/model_sharing.md
@@ -43,7 +43,7 @@ picture-in-picture" allowfullscreen>
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash
... )
```
diff --git a/docs/source/ko/perf_train_special.md b/docs/source/ko/perf_train_special.md
new file mode 100644
index 00000000000000..188db542f7c01f
--- /dev/null
+++ b/docs/source/ko/perf_train_special.md
@@ -0,0 +1,63 @@
+
+
+# Apple 실리콘에서 Pytorch 학습 [[PyTorch training on Apple silicon]]
+
+이전에는 Mac에서 모델을 학습할 때 CPU만 사용할 수 있었습니다. 그러나 이제 PyTorch v1.12의 출시로 Apple 실리콘 GPU를 사용하여 훨씬 더 빠른 성능으로 모델을 학습할 수 있게 되었습니다. 이는 PyTorch에 Apple의 Metal Performance Shaders(MPS)가 백엔드로 통합되면서 가능해졌습니다. [MPS 백엔드](https://pytorch.org/docs/stable/notes/mps.html)는 PyTorch 연산을 Metal 셰이더로 구현하고 이 모듈들을 `mps` 장치에서 실행할 수 있도록 지원합니다.
+
+
+
+일부 Pytorch 연산들은 아직 MPS에서 지원되지 않아 오류가 발생할 수 있습니다. 이를 방지하려면 환경 변수 `PYTORCH_ENABLE_MPS_FALLBACK=1` 를 설정하여 CPU 커널을 대신 사용하도록 해야 합니다(이때 `UserWarning`이 여전히 표시될 수 있습니다).
+
+
+
+다른 오류가 발생할 경우 [PyTorch](https://github.com/pytorch/pytorch/issues) 리포지토리에 이슈를 등록해주세요. 현재 [`Trainer`]는 MPS 백엔드만 통합하고 있습니다.
+
+
+
+`mps` 장치를 이용하면 다음과 같은 이점들을 얻을 수 있습니다:
+
+* 로컬에서 더 큰 네트워크나 배치 크기로 학습 가능
+* GPU의 통합 메모리 아키텍처로 인해 메모리에 직접 접근할 수 있어 데이터 로딩 지연 감소
+* 클라우드 기반 GPU나 추가 GPU가 필요 없으므로 비용 절감 가능
+
+Pytorch가 설치되어 있는지 확인하고 시작하세요. MPS 가속은 macOS 12.3 이상에서 지원됩니다.
+
+```bash
+pip install torch torchvision torchaudio
+```
+
+[`TrainingArguments`]는 `mps` 장치가 사용 가능한 경우 이를 기본적으로 사용하므로 장치를 따로 설정할 필요가 없습니다. 예를 들어, MPS 백엔드를 자동으로 활성화하여 [run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) 스크립트를 아무 수정 없이 실행할 수 있습니다.
+
+```diff
+export TASK_NAME=mrpc
+
+python examples/pytorch/text-classification/run_glue.py \
+ --model_name_or_path google-bert/bert-base-cased \
+ --task_name $TASK_NAME \
+- --use_mps_device \
+ --do_train \
+ --do_eval \
+ --max_seq_length 128 \
+ --per_device_train_batch_size 32 \
+ --learning_rate 2e-5 \
+ --num_train_epochs 3 \
+ --output_dir /tmp/$TASK_NAME/ \
+ --overwrite_output_dir
+```
+
+`gloo`와 `nccl`과 같은 [분산 학습 백엔드](https://pytorch.org/docs/stable/distributed.html#backends)는 `mps` 장치에서 지원되지 않으므로, MPS 백엔드에서는 단일 GPU로만 학습이 가능합니다.
+
+Mac에서 가속된 PyTorch 학습에 대한 더 자세한 내용은 [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) 블로그 게시물에서 확인할 수 있습니다.
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
index 07c97e51550cb7..7ebddf0339146e 100644
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -86,6 +86,8 @@
title: 🤗Transformers能做什么
- local: tokenizer_summary
title: 分词器的摘要
+ - local: attention
+ title: 注意力机制
title: 概念指南
- sections:
- sections:
diff --git a/docs/source/zh/attention.md b/docs/source/zh/attention.md
new file mode 100644
index 00000000000000..357a574a2d2e69
--- /dev/null
+++ b/docs/source/zh/attention.md
@@ -0,0 +1,37 @@
+
+
+# 注意力机制
+
+大多数 transformer 模型使用完全注意力机制,该机制采用正方形的注意力矩阵。当输入很长的文本时,这将导致巨大的计算瓶颈。Longformer 和 Reformer 是提高注意力机制效率的改进模型,它们使用稀疏化的注意力矩阵来加速训练。
+
+## 局部敏感哈希注意力机制(LSH attention)
+
+[Reformer](model_doc/reformer)使用LSH(局部敏感哈希)的注意力机制。在计算softmax(QK^t)时,只有矩阵QK^t中的最大元素(在softmax维度上)会做出有用的贡献。所以对于Q中的每个查询q,我们只需要考虑K中与q接近的键k,这里使用了一个哈希函数来确定q和k是否接近。注意力掩码被修改以掩盖当前的词符(token)(除了第一个位置之外),因为这样会使得查询和键相等(因此非常相似)。由于哈希可能会有些随机性,所以在实践中使用多个哈希函数(由n_rounds参数确定),然后一起求平均。
+
+## 局部注意力机制(Local attention)
+[Longformer](model_doc/longformer)使用局部注意力机制:通常情况下,局部上下文(例如,左边和右边的两个词符是什么?)对于给定词符的操作已经足够了。此外,通过堆叠具有小窗口的注意力层,最后一层将拥有不仅仅是窗口内词符的感受野,这使得它们能构建整个句子的表示。
+
+一些预先选定的输入词符也被赋予全局注意力:对于这些少数词符,注意力矩阵可以访问所有词符(tokens),并且这个过程是对称的:所有其他词符除了它们局部窗口内的词符之外,也可以访问这些特定的词符。这在论文的图2d中有展示,下面是一个样本注意力掩码:
+
+
+
+
+
+使用参数更少的注意力矩阵,可以让模型处理更长的输入序列。
+
+## 其他技巧
+
+### 轴向位置编码
+
+[Reformer](model_doc/reformer)模型使用轴向位置编码:在传统的transformer模型中,位置编码矩阵E的大小是\\(l\\)乘以\\(d\\),其中\\(l\\)是序列长度,\\(d\\)是隐藏状态的维度。如果你有非常长的文本,这个矩阵可能会非常大,将会占用大量的GPU显存。为了缓解这个问题,轴向位置编码将这个大矩阵E分解成两个较小的矩阵E1和E2,它们的维度分别是\\(l_{1} \times d_{1}\\) 和\\(l_{2} \times d_{2}\\),满足\\(l_{1} \times l_{2} = l\\)和\\(d_{1} + d_{2} = d\\)(通过长度的乘积,最终得到的矩阵要小得多)。在E中,对于时间步\\(j\\) 的嵌入是通过连接E1中时间步 \\(j \% l1\\) 的嵌入和E2中时间步\\(j // l1\\)的嵌入来获得的。
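+
+下面是一个极简的示意代码,用于说明上述分解方式;其中各个维度的取值只是为了演示而假设的,并不对应任何具体模型:
+
+```python
+import torch
+
+# 假设 l = l1 * l2, d = d1 + d2(数值仅为演示)
+l1, l2, d1, d2 = 64, 128, 256, 768
+E1 = torch.randn(l1, d1)
+E2 = torch.randn(l2, d2)
+
+def axial_position_embedding(j: int) -> torch.Tensor:
+    # 时间步 j 的位置编码:拼接 E1 中第 j % l1 行与 E2 中第 j // l1 行
+    return torch.cat([E1[j % l1], E2[j // l1]])
+
+print(axial_position_embedding(5000).shape)  # torch.Size([1024])
+```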
+
diff --git a/docs/source/zh/main_classes/pipelines.md b/docs/source/zh/main_classes/pipelines.md
index 370b50d2469604..bc16709d8b4832 100644
--- a/docs/source/zh/main_classes/pipelines.md
+++ b/docs/source/zh/main_classes/pipelines.md
@@ -455,6 +455,12 @@ See [`TokenClassificationPipeline`] for all details.
- __call__
- all
+### ImageTextToTextPipeline
+
+[[autodoc]] ImageTextToTextPipeline
+ - __call__
+ - all
+
### MaskGenerationPipeline
[[autodoc]] MaskGenerationPipeline
diff --git a/docs/source/zh/model_sharing.md b/docs/source/zh/model_sharing.md
index e28a000c11535e..35e317bcac3646 100644
--- a/docs/source/zh/model_sharing.md
+++ b/docs/source/zh/model_sharing.md
@@ -43,7 +43,7 @@ Model Hub的内置版本控制基于git和[git-lfs](https://git-lfs.github.com/)
```py
>>> model = AutoModel.from_pretrained(
-... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash
... )
```
diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py
index 3c7848e69569cc..7042c586cbb636 100644
--- a/examples/modular-transformers/configuration_my_new_model.py
+++ b/examples/modular-transformers/configuration_my_new_model.py
@@ -1,9 +1,9 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_my_new_model.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_my_new_model.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
@@ -130,6 +130,16 @@ class MyNewModelConfig(PretrainedConfig):
model_type = "my_new_model"
keys_to_ignore_at_inference = ["past_key_values"]
+ # Default tensor parallel plan for base model `MyNewModelModel`
+ base_model_tp_plan = {
+ "layers.*.self_attn.q_proj": "colwise",
+ "layers.*.self_attn.k_proj": "colwise",
+ "layers.*.self_attn.v_proj": "colwise",
+ "layers.*.self_attn.o_proj": "rowwise",
+ "layers.*.mlp.gate_proj": "colwise",
+ "layers.*.mlp.up_proj": "colwise",
+ "layers.*.mlp.down_proj": "rowwise",
+ }
def __init__(
self,
@@ -158,6 +168,13 @@ def __init__(
new_param=0,
**kwargs,
):
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
@@ -187,11 +204,3 @@ def __init__(
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
self.new_param = new_param
-
- super().__init__(
- pad_token_id=pad_token_id,
- bos_token_id=bos_token_id,
- eos_token_id=eos_token_id,
- tie_word_embeddings=tie_word_embeddings,
- **kwargs,
- )
diff --git a/examples/modular-transformers/configuration_my_new_model2.py b/examples/modular-transformers/configuration_my_new_model2.py
index 5fef1cecc702d7..eddd7fe47973ef 100644
--- a/examples/modular-transformers/configuration_my_new_model2.py
+++ b/examples/modular-transformers/configuration_my_new_model2.py
@@ -1,9 +1,9 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_my_new_model2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_my_new_model2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
@@ -11,106 +11,6 @@
class MyNewModel2Config(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a [`MyNewModel2Model`]. It is used to instantiate an MyNewModel2
- model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
- defaults will yield a similar configuration to that of the MyNewModel2-7B.
-
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
- documentation from [`PretrainedConfig`] for more information.
-
-
- Args:
- vocab_size (`int`, *optional*, defaults to 32000):
- Vocabulary size of the MyNewModel2 model. Defines the number of different tokens that can be represented by the
- `inputs_ids` passed when calling [`MyNewModel2Model`]
- hidden_size (`int`, *optional*, defaults to 4096):
- Dimension of the hidden representations.
- intermediate_size (`int`, *optional*, defaults to 11008):
- Dimension of the MLP representations.
- num_hidden_layers (`int`, *optional*, defaults to 32):
- Number of hidden layers in the Transformer decoder.
- num_attention_heads (`int`, *optional*, defaults to 32):
- Number of attention heads for each attention layer in the Transformer decoder.
- num_key_value_heads (`int`, *optional*):
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
- by meanpooling all the original heads within that group. For more details checkout [this
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
- `num_attention_heads`.
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
- The non-linear activation function (function or string) in the decoder.
- max_position_embeddings (`int`, *optional*, defaults to 2048):
- The maximum sequence length that this model might ever be used with. MyNewModel2 1 supports up to 2048 tokens,
- MyNewModel2 2 up to 4096, CodeMyNewModel2 up to 16384.
- initializer_range (`float`, *optional*, defaults to 0.02):
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
- The epsilon used by the rms normalization layers.
- use_cache (`bool`, *optional*, defaults to `True`):
- Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if `config.is_decoder=True`.
- pad_token_id (`int`, *optional*):
- Padding token id.
- bos_token_id (`int`, *optional*, defaults to 1):
- Beginning of stream token id.
- eos_token_id (`int`, *optional*, defaults to 2):
- End of stream token id.
- pretraining_tp (`int`, *optional*, defaults to 1):
- Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
- document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
- understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
- results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
- Whether to tie weight embeddings
- rope_theta (`float`, *optional*, defaults to 10000.0):
- The base period of the RoPE embeddings.
- rope_scaling (`Dict`, *optional*):
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
- accordingly.
- Expected contents:
- `rope_type` (`str`):
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
- 'my_new_model23'], with 'default' being the original RoPE implementation.
- `factor` (`float`, *optional*):
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
- original maximum pre-trained length.
- `original_max_position_embeddings` (`int`, *optional*):
- Used with 'dynamic', 'longrope' and 'my_new_model23'. The original max position embeddings used during
- pretraining.
- `attention_factor` (`float`, *optional*):
- Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
- computation. If unspecified, it defaults to value recommended by the implementation, using the
- `factor` field to infer the suggested value.
- `beta_fast` (`float`, *optional*):
- Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
- ramp function. If unspecified, it defaults to 32.
- `beta_slow` (`float`, *optional*):
- Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
- ramp function. If unspecified, it defaults to 1.
- `short_factor` (`List[float]`, *optional*):
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
- size divided by the number of attention heads divided by 2
- `long_factor` (`List[float]`, *optional*):
- Only used with 'longrope'. The scaling factor to be applied to long contexts (<
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
- size divided by the number of attention heads divided by 2
- `low_freq_factor` (`float`, *optional*):
- Only used with 'my_new_model23'. Scaling factor applied to low frequency components of the RoPE
- `high_freq_factor` (`float`, *optional*):
- Only used with 'my_new_model23'. Scaling factor applied to high frequency components of the RoPE
- attention_bias (`bool`, *optional*, defaults to `False`):
- Whether to use a bias in the query, key, value and output projection layers during self-attention.
- attention_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for the attention probabilities.
- mlp_bias (`bool`, *optional*, defaults to `False`):
- Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
- head_dim (`int`, *optional*):
- The attention head dimension. If None, it will default to hidden_size // num_heads
This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Gemma-7B.
@@ -121,7 +21,6 @@ class MyNewModel2Config(PretrainedConfig):
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`GemmaModel`]
-
```python
>>> from transformers import GemmaModel, GemmaConfig
>>> # Initializing a Gemma gemma-7b style configuration
@@ -134,6 +33,16 @@ class MyNewModel2Config(PretrainedConfig):
model_type = "my_new_model2"
keys_to_ignore_at_inference = ["past_key_values"]
+ # Default tensor parallel plan for base model `MyNewModel2Model`
+ base_model_tp_plan = {
+ "layers.*.self_attn.q_proj": "colwise",
+ "layers.*.self_attn.k_proj": "colwise",
+ "layers.*.self_attn.v_proj": "colwise",
+ "layers.*.self_attn.o_proj": "rowwise",
+ "layers.*.mlp.gate_proj": "colwise",
+ "layers.*.mlp.up_proj": "colwise",
+ "layers.*.mlp.down_proj": "rowwise",
+ }
def __init__(
self,
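
Editor's note: the `base_model_tp_plan` added above maps module-name patterns to a sharding style; following the usual Megatron-style convention, `colwise` shards a linear layer along its output features and `rowwise` along its input features. As a minimal, illustrative sketch of how such a pattern table can be resolved against concrete module names (the `shard_hint` helper below is hypothetical and not part of the transformers API):

```python
# Illustrative only: resolve a pattern-based tensor-parallel plan against module names.
import fnmatch
from typing import Optional

base_model_tp_plan = {
    "layers.*.self_attn.q_proj": "colwise",
    "layers.*.self_attn.o_proj": "rowwise",
    "layers.*.mlp.down_proj": "rowwise",
}

def shard_hint(module_name: str, plan: dict) -> Optional[str]:
    """Return the sharding style for a module name, or None if it stays replicated."""
    for pattern, style in plan.items():
        if fnmatch.fnmatch(module_name, pattern):
            return style
    return None

print(shard_hint("layers.3.self_attn.q_proj", base_model_tp_plan))  # colwise
print(shard_hint("layers.3.input_layernorm", base_model_tp_plan))   # None
```
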
diff --git a/examples/modular-transformers/configuration_new_model.py b/examples/modular-transformers/configuration_new_model.py
index 8bc8ef52cee62f..4d164fe3e75f18 100644
--- a/examples/modular-transformers/configuration_new_model.py
+++ b/examples/modular-transformers/configuration_new_model.py
@@ -1,9 +1,9 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_new_model.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_new_model.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Example where we only want to overwrite the defaults of an init
from ...configuration_utils import PretrainedConfig
@@ -104,6 +104,13 @@ def __init__(
attention_dropout=0.0,
**kwargs,
):
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
@@ -121,14 +128,6 @@ def __init__(
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
- super().__init__(
- pad_token_id=pad_token_id,
- bos_token_id=bos_token_id,
- eos_token_id=eos_token_id,
- tie_word_embeddings=tie_word_embeddings,
- **kwargs,
- )
-
@property
def num_heads(self):
return self.num_attention_heads
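
Editor's note: the hunk above only reorders the constructor so that `super().__init__(...)` runs before the model-specific attribute assignments. As a generic Python illustration of that ordering (the `Base`/`Child` classes below are hypothetical stand-ins, not the `PretrainedConfig` implementation), fields assigned after the base constructor are the ones that end up on the instance:

```python
# Hypothetical stand-ins for PretrainedConfig and the generated config class.
class Base:
    def __init__(self, **kwargs):
        # base-class bookkeeping, e.g. storing shared token ids
        self.pad_token_id = kwargs.pop("pad_token_id", None)

class Child(Base):
    def __init__(self, vocab_size=256000, pad_token_id=None, **kwargs):
        super().__init__(pad_token_id=pad_token_id, **kwargs)  # base bookkeeping runs first
        self.vocab_size = vocab_size                           # model-specific fields assigned afterwards

cfg = Child(vocab_size=32000, pad_token_id=0)
print(cfg.vocab_size, cfg.pad_token_id)  # 32000 0
```
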
diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py
index b5b1fc6aec85e6..0b373d4e6eab01 100644
--- a/examples/modular-transformers/modeling_dummy.py
+++ b/examples/modular-transformers/modeling_dummy.py
@@ -1,26 +1,23 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_dummy.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_dummy.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
from typing import List, Optional, Tuple, Union
import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
-from ...modeling_flash_attention_utils import _flash_attention_forward
-from ...modeling_outputs import (
- BaseModelOutputWithPast,
-)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward
+from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@@ -33,59 +30,6 @@
logger = logging.get_logger(__name__)
-def _prepare_4d_causal_attention_mask_with_cache_position(
- attention_mask: torch.Tensor,
- sequence_length: int,
- target_length: int,
- dtype: torch.dtype,
- device: torch.device,
- min_dtype: float,
- cache_position: torch.Tensor,
- batch_size: int,
-):
- """
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
- Args:
- attention_mask (`torch.Tensor`):
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
- sequence_length (`int`):
- The sequence length being processed.
- target_length (`int`):
- The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
- dtype (`torch.dtype`):
- The dtype to use for the 4D attention mask.
- device (`torch.device`):
- The device to plcae the 4D attention mask on.
- min_dtype (`float`):
- The minimum value representable with the dtype `dtype`.
- cache_position (`torch.Tensor`):
- Indices depicting the position of the input sequence tokens in the sequence.
- batch_size (`torch.Tensor`):
- Batch size.
- """
- if attention_mask is not None and attention_mask.dim() == 4:
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
- causal_mask = attention_mask
- else:
- causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
- if sequence_length != 1:
- causal_mask = torch.triu(causal_mask, diagonal=1)
- causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
- if attention_mask is not None:
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
- mask_length = attention_mask.shape[-1]
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
- padding_mask = padding_mask == 0
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
- padding_mask, min_dtype
- )
-
- return causal_mask
-
-
class DummyRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@@ -193,6 +137,22 @@ def forward(self, x, position_ids):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class DummyMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
@@ -227,40 +187,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
return q_embed, k_embed
-class DummyMLP(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.config = config
- self.hidden_size = config.hidden_size
- self.intermediate_size = config.intermediate_size
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
- self.act_fn = ACT2FN[config.hidden_act]
-
- def forward(self, x):
- if self.config.pretraining_tp > 1:
- slice = self.intermediate_size // self.config.pretraining_tp
- gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
- up_proj_slices = self.up_proj.weight.split(slice, dim=0)
- down_proj_slices = self.down_proj.weight.split(slice, dim=1)
-
- gate_proj = torch.cat(
- [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
- )
- up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
-
- intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
- down_proj = [
- F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
- ]
- down_proj = sum(down_proj)
- else:
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
- return down_proj
-
-
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
@@ -319,31 +245,14 @@ def forward(
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
- if self.config.pretraining_tp > 1:
- key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
- query_slices = self.q_proj.weight.split(
- (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
- )
- key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
- value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
- query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
- query_states = torch.cat(query_states, dim=-1)
-
- key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
- key_states = torch.cat(key_states, dim=-1)
-
- value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
- value_states = torch.cat(value_states, dim=-1)
-
- else:
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@@ -385,12 +294,7 @@ def forward(
attn_output = attn_output.reshape(bsz, q_len, -1)
- if self.config.pretraining_tp > 1:
- attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
- o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
- attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
- else:
- attn_output = self.o_proj(attn_output)
+ attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@@ -423,6 +327,7 @@ def forward(
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
+ **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if isinstance(past_key_value, StaticCache):
raise ValueError(
@@ -507,6 +412,7 @@ def forward(
sliding_window=getattr(self, "sliding_window", None),
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
+ **kwargs,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@@ -561,9 +467,10 @@ def forward(
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@@ -847,7 +754,10 @@ def __init__(self, config: DummyConfig):
)
self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = DummyRotaryEmbedding(config=config)
+
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+ logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -871,6 +781,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -926,7 +837,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -952,6 +863,7 @@ def forward(
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
+ **flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
@@ -1011,10 +923,9 @@ def _update_causal_mask(
return None
dtype, device = input_tensor.dtype, input_tensor.device
- min_dtype = torch.finfo(dtype).min
sequence_length = input_tensor.shape[1]
if using_static_cache:
- target_length = past_key_values.get_max_length()
+ target_length = past_key_values.get_max_cache_shape()
else:
target_length = (
attention_mask.shape[-1]
@@ -1023,13 +934,12 @@ def _update_causal_mask(
)
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
- causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
- min_dtype=min_dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@@ -1043,6 +953,63 @@ def _update_causal_mask(
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
return causal_mask
+
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding (the part of the cache that is not filled yet).
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`torch.Tensor`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
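
Editor's note: the staticmethod above documents how the 4D mask is assembled. The tiny, self-contained sketch below (assuming the same construction, with hypothetical example sizes) shows the causal part of that mask for a 3-token query written into a static cache of length 5, where the query occupies cache slots 2 to 4:

```python
# Minimal sketch mirroring the causal part of the mask construction above.
import torch

sequence_length, target_length = 3, 5
dtype = torch.float32
min_dtype = torch.finfo(dtype).min
cache_position = torch.arange(2, 2 + sequence_length)  # query tokens sit in cache slots 2..4

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)

print(causal_mask == 0)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True, False],
#         [ True,  True,  True,  True,  True]])
```

Positions that stay at 0 are attendable; positions holding `min_dtype` are masked out before the softmax.
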
diff --git a/examples/modular-transformers/modeling_dummy_bert.py b/examples/modular-transformers/modeling_dummy_bert.py
index 611d7be961f7e4..e18e6a19e8a3fa 100644
--- a/examples/modular-transformers/modeling_dummy_bert.py
+++ b/examples/modular-transformers/modeling_dummy_bert.py
@@ -1,27 +1,20 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_dummy_bert.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_dummy_bert.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
from typing import List, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
from packaging import version
from torch import nn
from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import (
- _prepare_4d_attention_mask_for_sdpa,
- _prepare_4d_causal_attention_mask_for_sdpa,
-)
-from ...modeling_outputs import (
- BaseModelOutputWithPastAndCrossAttentions,
- BaseModelOutputWithPoolingAndCrossAttentions,
-)
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
@@ -40,79 +33,6 @@
_CONFIG_FOR_DOC = "DummyBertConfig"
-def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
- """Load tf checkpoints in a pytorch model."""
- try:
- import re
-
- import numpy as np
- import tensorflow as tf
- except ImportError:
- logger.error(
- "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
- "https://www.tensorflow.org/install/ for installation instructions."
- )
- raise
- tf_path = os.path.abspath(tf_checkpoint_path)
- logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
- # Load weights from TF model
- init_vars = tf.train.list_variables(tf_path)
- names = []
- arrays = []
- for name, shape in init_vars:
- logger.info(f"Loading TF weight {name} with shape {shape}")
- array = tf.train.load_variable(tf_path, name)
- names.append(name)
- arrays.append(array)
-
- for name, array in zip(names, arrays):
- name = name.split("/")
- # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
- # which are not required for using pretrained model
- if any(
- n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
- for n in name
- ):
- logger.info(f"Skipping {'/'.join(name)}")
- continue
- pointer = model
- for m_name in name:
- if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
- scope_names = re.split(r"_(\d+)", m_name)
- else:
- scope_names = [m_name]
- if scope_names[0] == "kernel" or scope_names[0] == "gamma":
- pointer = getattr(pointer, "weight")
- elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
- pointer = getattr(pointer, "bias")
- elif scope_names[0] == "output_weights":
- pointer = getattr(pointer, "weight")
- elif scope_names[0] == "squad":
- pointer = getattr(pointer, "classifier")
- else:
- try:
- pointer = getattr(pointer, scope_names[0])
- except AttributeError:
- logger.info(f"Skipping {'/'.join(name)}")
- continue
- if len(scope_names) >= 2:
- num = int(scope_names[1])
- pointer = pointer[num]
- if m_name[-11:] == "_embeddings":
- pointer = getattr(pointer, "weight")
- elif m_name == "kernel":
- array = np.transpose(array)
- try:
- if pointer.shape != array.shape:
- raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
- except ValueError as e:
- e.args += (pointer.shape, array.shape)
- raise
- logger.info(f"Initialize PyTorch weight {name}")
- pointer.data = torch.from_numpy(array)
- return model
-
-
class DummyBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
@@ -706,6 +626,79 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return pooled_output
+def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required when using a pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ except ValueError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
class DummyBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -871,26 +864,6 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
- r"""
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
- the model is configured as a decoder.
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
- `past_key_values`).
- """
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
@@ -1027,7 +1000,6 @@ def forward(
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
- return super().forward(input_ids)
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py
index 49cdd274162092..189e090094c76c 100644
--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@@ -1,25 +1,20 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_my_new_model2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_my_new_model2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
from typing import List, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import _flash_attention_forward
-from ...modeling_outputs import (
- BaseModelOutputWithPast,
- SequenceClassifierOutputWithPast,
-)
+from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
@@ -30,6 +25,9 @@
from .configuration_my_new_model2 import MyNewModel2Config
+logger = logging.get_logger(__name__)
+
+
class MyNewModel2RMSNorm(nn.Module):
def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
@@ -50,9 +48,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
-logger = logging.get_logger(__name__)
-
-
class MyNewModel2RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
@@ -448,59 +443,6 @@ def forward(
return attn_output, attn_weights, past_key_value
-def _prepare_4d_causal_attention_mask_with_cache_position(
- attention_mask: torch.Tensor,
- sequence_length: int,
- target_length: int,
- dtype: torch.dtype,
- device: torch.device,
- min_dtype: float,
- cache_position: torch.Tensor,
- batch_size: int,
-):
- """
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
- Args:
- attention_mask (`torch.Tensor`):
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
- sequence_length (`int`):
- The sequence length being processed.
- target_length (`int`):
- The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
- dtype (`torch.dtype`):
- The dtype to use for the 4D attention mask.
- device (`torch.device`):
- The device to plcae the 4D attention mask on.
- min_dtype (`float`):
- The minimum value representable with the dtype `dtype`.
- cache_position (`torch.Tensor`):
- Indices depicting the position of the input sequence tokens in the sequence.
- batch_size (`torch.Tensor`):
- Batch size.
- """
- if attention_mask is not None and attention_mask.dim() == 4:
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
- causal_mask = attention_mask
- else:
- causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
- if sequence_length != 1:
- causal_mask = torch.triu(causal_mask, diagonal=1)
- causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
- if attention_mask is not None:
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
- mask_length = attention_mask.shape[-1]
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
- padding_mask = padding_mask == 0
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
- padding_mask, min_dtype
- )
-
- return causal_mask
-
-
MY_NEW_MODEL2_ATTENTION_CLASSES = {
"eager": MyNewModel2Attention,
"flash_attention_2": MyNewModel2FlashAttention2,
@@ -725,7 +667,10 @@ def __init__(self, config: MyNewModel2Config):
[MyNewModel2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+ logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -810,7 +755,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -893,10 +838,9 @@ def _update_causal_mask(
return None
dtype, device = input_tensor.dtype, input_tensor.device
- min_dtype = torch.finfo(dtype).min
sequence_length = input_tensor.shape[1]
if using_static_cache:
- target_length = past_key_values.get_max_length()
+ target_length = past_key_values.get_max_cache_shape()
else:
target_length = (
attention_mask.shape[-1]
@@ -905,13 +849,12 @@ def _update_causal_mask(
)
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
- causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
- min_dtype=min_dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@@ -925,10 +868,67 @@ def _update_causal_mask(
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
return causal_mask
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding (the part of the cache that is not filled yet).
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`torch.Tensor`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
@add_start_docstrings(
"""
@@ -1019,27 +1019,8 @@ def forward(
loss = None
if labels is not None:
- labels = labels.to(logits.device)
- if self.config.problem_type is None:
- if self.num_labels == 1:
- self.config.problem_type = "regression"
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
- self.config.problem_type = "single_label_classification"
- else:
- self.config.problem_type = "multi_label_classification"
-
- if self.config.problem_type == "regression":
- loss_fct = MSELoss()
- if self.num_labels == 1:
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
- else:
- loss = loss_fct(pooled_logits, labels)
- elif self.config.problem_type == "single_label_classification":
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
- elif self.config.problem_type == "multi_label_classification":
- loss_fct = BCEWithLogitsLoss()
- loss = loss_fct(pooled_logits, labels)
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
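
Editor's note: the hunk above swaps the inlined `problem_type` branching for the shared `self.loss_function(...)` hook. As a hedged sketch of what that branching covered (a standalone helper mirroring the removed code, not the actual hook implementation):

```python
import torch
from torch import nn

def sequence_classification_loss(pooled_logits, labels, num_labels, problem_type=None):
    # Mirrors the removed branch logic; the real `self.loss_function` hook may differ.
    if problem_type is None:
        if num_labels == 1:
            problem_type = "regression"
        elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
            problem_type = "single_label_classification"
        else:
            problem_type = "multi_label_classification"
    if problem_type == "regression":
        loss_fct = nn.MSELoss()
        if num_labels == 1:
            return loss_fct(pooled_logits.squeeze(), labels.squeeze())
        return loss_fct(pooled_logits, labels)
    if problem_type == "single_label_classification":
        return nn.CrossEntropyLoss()(pooled_logits.view(-1, num_labels), labels.view(-1))
    # multi-label: labels are expected to be float multi-hot targets
    return nn.BCEWithLogitsLoss()(pooled_logits, labels)

logits = torch.randn(4, 3)
labels = torch.tensor([0, 2, 1, 1])
print(sequence_classification_loss(logits, labels, num_labels=3))
```
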
diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py
index 640331ace1d57b..4556308f1ea077 100644
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@@ -8,7 +8,6 @@
from typing import ClassVar, List, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
from torch import nn
from ...cache_utils import Cache, StaticCache
@@ -18,92 +17,15 @@
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
- is_flash_attn_2_available,
- logging,
replace_return_docstrings,
)
-from .configuration_new_task_model import NewTaskModelConfig
-
-
-if is_flash_attn_2_available():
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
-
from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_new_task_model import NewTaskModelConfig
-logger = logging.get_logger(__name__)
-
_CONFIG_FOR_DOC = "NewTaskModelConfig"
-# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
-# But NewTaskModel has no causal mask on prefix
-def _prepare_4d_causal_attention_mask_with_cache_position(
- attention_mask: torch.Tensor,
- sequence_length: int,
- target_length: int,
- dtype: torch.dtype,
- device: torch.device,
- min_dtype: float,
- cache_position: torch.Tensor,
- batch_size: int,
- is_training: bool = False,
- token_type_ids: torch.Tensor = None,
-):
- """
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
- Args:
- attention_mask (`torch.Tensor`):
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
- sequence_length (`int`):
- The sequence length being processed.
- target_length (`int`):
- The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
- dtype (`torch.dtype`):
- The dtype to use for the 4D attention mask.
- device (`torch.device`):
- The device to plcae the 4D attention mask on.
- min_dtype (`float`):
- The minimum value representable with the dtype `dtype`.
- cache_position (`torch.Tensor`):
- Indices depicting the position of the input sequence tokens in the sequence.
- batch_size (`torch.Tensor`):
- Batch size.
- is_training (`bool`):
- Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels`
- """
- if attention_mask is not None and attention_mask.dim() == 4:
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
- causal_mask = attention_mask
- else:
- causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
- # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
- if sequence_length != 1:
- if is_training:
- causal_mask = torch.triu(causal_mask, diagonal=1)
- else:
- causal_mask[:, :sequence_length] = 0.0
-
- causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
- if attention_mask is not None:
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
- mask_length = attention_mask.shape[-1]
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
- padding_mask = padding_mask == 0
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
- padding_mask, min_dtype
- )
- # we are training thus we need to create a full mask on the image + prefix but causal on suffix
- if is_training:
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
- token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
- )
- return causal_mask
-
-
@dataclass
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
"""
@@ -182,12 +104,12 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["NewTaskModelMultiModalProjector"]
_skip_keys_device_placement = "past_key_values"
- _supports_flash_attn_2 = False
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
- _supports_sdpa = True
_supports_cache_class = True
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
def _init_weights(self, module):
# important: this ported version of NewTaskModel isn't meant for training from scratch - only
@@ -210,14 +132,6 @@ def _init_weights(self, module):
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
- @property
- def _supports_sdpa(self):
- """
- Retrieve language_model's attribute to check whether the model supports
- SDPA or not.
- """
- return self.language_model._supports_sdpa
-
NEW_TASK_MODEL_INPUTS_DOCSTRING = r"""
Args:
@@ -301,11 +215,8 @@ def __init__(self, config):
self.vision_tower = AutoModel.from_config(config=config.vision_config)
self.multi_modal_projector = NewTaskModelMultiModalProjector(config)
self.vocab_size = config.text_config.vocab_size
- self._attn_implementation = config._attn_implementation
- language_model = AutoModelForCausalLM.from_config(
- config=config.text_config, attn_implementation=self._attn_implementation
- )
+ language_model = AutoModelForCausalLM.from_config(config=config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
@@ -344,6 +255,11 @@ def tie_weights(self):
def _update_causal_mask(
self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False
):
+ if self.config.text_config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and 0.0 in attention_mask:
+ return attention_mask
+ return None
+
using_static_cache = isinstance(past_key_values, StaticCache)
dtype = inputs_embeds.dtype
min_dtype = torch.finfo(dtype).min
@@ -388,6 +304,22 @@ def _update_causal_mask(
)
return causal_mask
+ def get_image_features(self, pixel_values: torch.FloatTensor):
+ """
+ Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+ The tensors corresponding to the input images.
+ Returns:
+ image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+ """
+ image_outputs = self.vision_tower(pixel_values)
+ selected_image_feature = image_outputs.last_hidden_state
+ image_features = self.multi_modal_projector(selected_image_feature)
+ image_features = image_features / (self.config.hidden_size**0.5)
+ return image_features
+
@add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
@@ -426,9 +358,9 @@ def forward(
```python
>>> from PIL import Image
>>> import requests
- >>> from transformers import AutoProcessor, NewTaskModelForNewTask
+ >>> from transformers import AutoProcessor, NewTaskModelForConditionalGeneration
- >>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf")
+ >>> model = NewTaskModelForConditionalGeneration.from_pretrained("google/NewTaskModel-test-224px-hf")
>>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf")
>>> prompt = "answer en Where is the cow standing?"
@@ -484,6 +416,7 @@ def prepare_inputs_for_generation(
num_logits_to_keep=None,
**kwargs,
):
+ # Overwritten -- custom `position_ids` and `pixel_values` handling
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
@@ -493,33 +426,10 @@ def prepare_inputs_for_generation(
cache_position=cache_position,
use_cache=use_cache,
num_logits_to_keep=num_logits_to_keep,
+ token_type_ids=token_type_ids,
**kwargs,
)
- if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
- if model_inputs["inputs_embeds"] is not None:
- batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
- device = model_inputs["inputs_embeds"].device
- else:
- batch_size, sequence_length = model_inputs["input_ids"].shape
- device = model_inputs["input_ids"].device
-
- dtype = self.get_output_embeddings().weight.dtype
- min_dtype = torch.finfo(dtype).min
-
- model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position(
- attention_mask,
- sequence_length=sequence_length,
- target_length=past_key_values.get_max_length(),
- dtype=dtype,
- device=device,
- min_dtype=min_dtype,
- cache_position=cache_position,
- batch_size=batch_size,
- )
-
- model_inputs["token_type_ids"] = token_type_ids
-
# position_ids in NewTaskModel are 1-indexed
if model_inputs.get("position_ids") is not None:
model_inputs["position_ids"] += 1
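
Editor's note: among the additions above, `_update_causal_mask` now short-circuits for `flash_attention_2`, and `get_image_features` divides the projected image features by `config.hidden_size ** 0.5`. A minimal sketch of the flash-attention shortcut (same logic as the added lines, wrapped in a hypothetical standalone function):

```python
import torch
from typing import Optional

def fa2_attention_mask(attention_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    # Flash attention consumes the raw 2D padding mask (or no mask at all) rather than
    # a 4D additive mask, so the model returns early with one of those two forms.
    if attention_mask is not None and 0.0 in attention_mask:
        return attention_mask  # batch contains padding: keep the 2D mask
    return None                # fully dense batch: no mask needed

print(fa2_attention_mask(torch.tensor([[1.0, 1.0, 0.0]])))  # tensor([[1., 1., 0.]])
print(fa2_attention_mask(torch.tensor([[1.0, 1.0, 1.0]])))  # None
```
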
diff --git a/examples/modular-transformers/modeling_roberta.py b/examples/modular-transformers/modeling_roberta.py
new file mode 100644
index 00000000000000..e50cf60c3a4ed4
--- /dev/null
+++ b/examples/modular-transformers/modeling_roberta.py
@@ -0,0 +1,1014 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_roberta.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_roberta.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+import os
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from packaging import version
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ get_torch_version,
+ logging,
+)
+from .configuration_roberta import RobertaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "google-roberta/roberta-base-uncased"
+_CONFIG_FOR_DOC = "RobertaConfig"
+
+
+class RobertaEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size, config.pad_token_id
+ )
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+ self.register_buffer(
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+ )
+ self.pad_token_id = config.pad_token_id
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ past_key_values_length: int = 0,
+ ) -> torch.Tensor:
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+ # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+ # when it's auto-generated; the registered buffer helps users when tracing the model without passing token_type_ids and solves
+ # issue #5664
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings += position_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class RobertaSelfAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = position_embedding_type or getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ mixed_query_layer = self.query(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_value is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_value[0]
+ value_layer = past_key_value[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ use_cache = past_key_value is not None
+ if self.is_decoder:
+ # if cross_attention, save Tuple(torch.Tensor, torch.Tensor) of all cross-attention key/value_states.
+ # Further calls to the cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder), save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to the current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention, `past_key_value` is always `None`
+ past_key_value = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+ if use_cache:
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+ -1, 1
+ )
+ else:
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_l - position_ids_r
+
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in RobertaModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class RobertaSdpaSelfAttention(RobertaSelfAttention):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__(config, position_embedding_type=position_embedding_type)
+ self.dropout_prob = config.attention_probs_dropout_prob
+ self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
+
+ # Adapted from RobertaSelfAttention
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
+ logger.warning_once(
+ "RobertaSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
+ "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
+ "the manual attention implementation, but specifying the manual implementation will be required from "
+ "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
+ '`attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
+
+ # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention
+ # mask needs to be such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ current_states = encoder_hidden_states if is_cross_attention else hidden_states
+ attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+
+ # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning
+ if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
+ key_layer, value_layer = past_key_value
+ else:
+ key_layer = self.transpose_for_scores(self.key(current_states))
+ value_layer = self.transpose_for_scores(self.value(current_states))
+ if past_key_value is not None and not is_cross_attention:
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+
+ if self.is_decoder:
+ # if cross_attention, save Tuple(torch.Tensor, torch.Tensor) of all cross-attention key/value_states.
+ # Further calls to the cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder), save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to the current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention, `past_key_value` is always `None`
+ past_key_value = (key_layer, value_layer)
+
+ # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
+ # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
+ # Reference: https://github.com/pytorch/pytorch/issues/112577
+ if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
+ query_layer = query_layer.contiguous()
+ key_layer = key_layer.contiguous()
+ value_layer = value_layer.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
+ # a causal mask in case tgt_len == 1.
+ is_causal = (
+ True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False
+ )
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_layer,
+ key_layer,
+ value_layer,
+ attn_mask=attention_mask,
+ dropout_p=self.dropout_prob if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)
+
+ outputs = (attn_output,)
+ if self.is_decoder:
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class RobertaSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+ROBERTA_SELF_ATTENTION_CLASSES = {
+ "eager": RobertaSelfAttention,
+ "sdpa": RobertaSdpaSelfAttention,
+}
+
+
+class RobertaAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ self.self = ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation](
+ config, position_embedding_type=position_embedding_type
+ )
+ self.output = RobertaSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class RobertaIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class RobertaOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class RobertaLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = RobertaAttention(config)
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
+ self.intermediate = RobertaIntermediate(config)
+ self.output = RobertaOutput(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+
+ # if decoder, the last output is tuple of self-attn cache
+ if self.is_decoder:
+ outputs = self_attention_outputs[1:-1]
+ present_key_value = self_attention_outputs[-1]
+ else:
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ cross_attn_present_key_value = None
+ if self.is_decoder and encoder_hidden_states is not None:
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+ " by setting `config.add_cross_attention=True`"
+ )
+
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+ cross_attention_outputs = self.crossattention(
+ attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ cross_attn_past_key_value,
+ output_attentions,
+ )
+ attention_output = cross_attention_outputs[0]
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
+
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
+ cross_attn_present_key_value = cross_attention_outputs[-1]
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+ )
+ outputs = (layer_output,) + outputs
+
+ # if decoder, return the attn key/values as the last output
+ if self.is_decoder:
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+
+class RobertaEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ next_decoder_cache = () if use_cache else None
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class RobertaPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
+ """Load TF checkpoints into a PyTorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+ # which are not required when using the pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ except ValueError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+class RobertaPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = RobertaConfig
+ load_tf_weights = load_tf_weights_in_roberta
+ base_model_prefix = "roberta"
+ supports_gradient_checkpointing = True
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+ROBERTA_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ROBERTA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})` or `(batch_size, sequence_length, target_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Roberta Model transformer outputting raw hidden-states without any specific head on top.",
+ ROBERTA_START_DOCSTRING,
+)
+class RobertaModel(RobertaPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` and
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ _no_split_modules = ["RobertaEmbeddings", "RobertaLayer"]
+
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = RobertaEmbeddings(config)
+ self.encoder = RobertaEncoder(config)
+
+ self.pooler = RobertaPooler(config) if add_pooling_layer else None
+
+ self.attn_implementation = config._attn_implementation
+ self.position_embedding_type = config.position_embedding_type
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
+ base class PreTrainedModel for details.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ else:
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ batch_size, seq_length = input_shape
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ # past_key_values_length
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+ if token_type_ids is None:
+ if hasattr(self.embeddings, "token_type_ids"):
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is None:
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
+
+ use_sdpa_attention_masks = (
+ self.attn_implementation == "sdpa"
+ and self.position_embedding_type == "absolute"
+ and head_mask is None
+ and not output_attentions
+ )
+
+ # Expand the attention mask
+ if use_sdpa_attention_masks and attention_mask.dim() == 2:
+ # Expand the attention mask for SDPA.
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+ if self.config.is_decoder:
+ extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ input_shape,
+ embedding_output,
+ past_key_values_length,
+ )
+ else:
+ extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+ attention_mask, embedding_output.dtype, tgt_len=seq_length
+ )
+ else:
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention,
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.is_decoder and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+
+ if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
+ # Expand the attention mask for SDPA.
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+ encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+ encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
+ )
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicates we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
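The new `ROBERTA_SELF_ATTENTION_CLASSES` mapping above dispatches between the eager and SDPA attention paths based on `config._attn_implementation`, and `_supports_sdpa = True` opts the model into the SDPA path where available. A minimal sketch of how the two paths could be compared, assuming the public `roberta-base` checkpoint and the standard `attn_implementation` argument of `from_pretrained`:

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
inputs = tokenizer("SDPA and eager attention should agree.", return_tensors="pt")

# Same checkpoint, two attention implementations.
eager_model = AutoModel.from_pretrained("roberta-base", attn_implementation="eager").eval()
sdpa_model = AutoModel.from_pretrained("roberta-base", attn_implementation="sdpa").eval()

with torch.no_grad():
    eager_out = eager_model(**inputs).last_hidden_state
    sdpa_out = sdpa_model(**inputs).last_hidden_state

# The two paths should agree up to small numerical differences.
print(torch.max(torch.abs(eager_out - sdpa_out)))
```

Per the warning emitted in `RobertaSdpaSelfAttention.forward`, the SDPA path falls back to the eager implementation whenever `output_attentions=True`, a `head_mask` is passed, or a non-absolute `position_embedding_type` is configured.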
diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py
index d91bdb1820c2a3..7ad606280dcc96 100644
--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@@ -1,26 +1,23 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from .
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the diff. If any change should be done, please apply the change to the
-# diff.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_super.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular file. If any change is needed, please apply the change to the
+# modular_super.py file directly. One of our CI checks enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
from typing import List, Optional, Tuple, Union
import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
-from ...modeling_flash_attention_utils import _flash_attention_forward
-from ...modeling_outputs import (
- BaseModelOutputWithPast,
-)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward
+from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@@ -33,59 +30,6 @@
logger = logging.get_logger(__name__)
-def _prepare_4d_causal_attention_mask_with_cache_position(
- attention_mask: torch.Tensor,
- sequence_length: int,
- target_length: int,
- dtype: torch.dtype,
- device: torch.device,
- min_dtype: float,
- cache_position: torch.Tensor,
- batch_size: int,
-):
- """
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
- Args:
- attention_mask (`torch.Tensor`):
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
- sequence_length (`int`):
- The sequence length being processed.
- target_length (`int`):
- The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
- dtype (`torch.dtype`):
- The dtype to use for the 4D attention mask.
- device (`torch.device`):
- The device to plcae the 4D attention mask on.
- min_dtype (`float`):
- The minimum value representable with the dtype `dtype`.
- cache_position (`torch.Tensor`):
- Indices depicting the position of the input sequence tokens in the sequence.
- batch_size (`torch.Tensor`):
- Batch size.
- """
- if attention_mask is not None and attention_mask.dim() == 4:
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
- causal_mask = attention_mask
- else:
- causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
- if sequence_length != 1:
- causal_mask = torch.triu(causal_mask, diagonal=1)
- causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
- if attention_mask is not None:
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
- mask_length = attention_mask.shape[-1]
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
- padding_mask = padding_mask == 0
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
- padding_mask, min_dtype
- )
-
- return causal_mask
-
-
class SuperRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@@ -123,7 +67,7 @@ def __init__(
if config is None:
logger.warning_once(
"`SuperRotaryEmbedding` can now be fully parameterized by passing the model config through the "
- "`config` argument. All other arguments will be removed in v4.45"
+ "`config` argument. All other arguments will be removed in v4.46"
)
self.rope_kwargs = {
"rope_type": rope_type,
@@ -193,6 +137,22 @@ def forward(self, x, position_ids):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class SuperMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
@@ -227,40 +187,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
return q_embed, k_embed
-class SuperMLP(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.config = config
- self.hidden_size = config.hidden_size
- self.intermediate_size = config.intermediate_size
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
- self.act_fn = ACT2FN[config.hidden_act]
-
- def forward(self, x):
- if self.config.pretraining_tp > 1:
- slice = self.intermediate_size // self.config.pretraining_tp
- gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
- up_proj_slices = self.up_proj.weight.split(slice, dim=0)
- down_proj_slices = self.down_proj.weight.split(slice, dim=1)
-
- gate_proj = torch.cat(
- [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
- )
- up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
-
- intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
- down_proj = [
- F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
- ]
- down_proj = sum(down_proj)
- else:
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
- return down_proj
-
-
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
@@ -302,7 +228,7 @@ def __init__(self, config: SuperConfig, layer_idx: Optional[int] = None):
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
- # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers)
+ # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
self.rotary_emb = SuperRotaryEmbedding(config=self.config)
def forward(
@@ -314,42 +240,25 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
- if self.config.pretraining_tp > 1:
- key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
- query_slices = self.q_proj.weight.split(
- (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
- )
- key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
- value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
- query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
- query_states = torch.cat(query_states, dim=-1)
-
- key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
- key_states = torch.cat(key_states, dim=-1)
-
- value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
- value_states = torch.cat(value_states, dim=-1)
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
- else:
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
- "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
@@ -385,12 +294,7 @@ def forward(
attn_output = attn_output.reshape(bsz, q_len, -1)
- if self.config.pretraining_tp > 1:
- attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
- o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
- attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
- else:
- attn_output = self.o_proj(attn_output)
+ attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@@ -422,7 +326,8 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
+ **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if isinstance(past_key_value, StaticCache):
raise ValueError(
@@ -449,7 +354,7 @@ def forward(
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
- "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
@@ -507,6 +412,7 @@ def forward(
sliding_window=getattr(self, "sliding_window", None),
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
+ **kwargs,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@@ -535,7 +441,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
@@ -561,15 +467,16 @@ def forward(
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
- "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
@@ -644,7 +551,7 @@ def forward(
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
@@ -790,7 +697,8 @@ def _init_weights(self, module):
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- - a [`~cache_utils.Cache`] instance;
+ - a [`~cache_utils.Cache`] instance, see our
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
@@ -846,7 +754,10 @@ def __init__(self, config: SuperConfig):
)
self.norm = SuperRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = SuperRotaryEmbedding(config=config)
+
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+ logger.warning_once("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -916,10 +827,9 @@ def _update_causal_mask(
return None
dtype, device = input_tensor.dtype, input_tensor.device
- min_dtype = torch.finfo(dtype).min
sequence_length = input_tensor.shape[1]
if using_static_cache:
- target_length = past_key_values.get_max_length()
+ target_length = past_key_values.get_max_cache_shape()
else:
target_length = (
attention_mask.shape[-1]
@@ -928,13 +838,12 @@ def _update_causal_mask(
)
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
- causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
- min_dtype=min_dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@@ -948,6 +857,63 @@ def _update_causal_mask(
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
return causal_mask
+
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding, the part of the cache that is not filled yet.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`int`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
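For intuition about the `_prepare_4d_causal_attention_mask_with_cache_position` static method moved into the model class above, here is a small standalone sketch that mirrors its logic on a toy 2D padding mask (illustrative only; the fill value and broadcasting follow the function body shown in this hunk):

```python
import torch

batch_size, sequence_length, target_length = 1, 4, 4
dtype = torch.float32
min_dtype = torch.finfo(dtype).min

# 2D padding mask: the last key position of the only batch element is padding.
attention_mask = torch.tensor([[1, 1, 1, 0]])
cache_position = torch.arange(sequence_length)

# Causal upper triangle filled with min_dtype, then broadcast to (batch, 1, q_len, kv_len).
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

# Fold the padding mask in: positions that are causally visible but padded get min_dtype too.
padding_mask = causal_mask[:, :, :, :sequence_length] + attention_mask[:, None, None, :] == 0
causal_mask[:, :, :, :sequence_length] = causal_mask[:, :, :, :sequence_length].masked_fill(
    padding_mask, min_dtype
)

print(causal_mask.shape)  # torch.Size([1, 1, 4, 4]); 0.0 where attention is allowed, min_dtype where masked
```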
diff --git a/examples/modular-transformers/modular_roberta.py b/examples/modular-transformers/modular_roberta.py
index a3e0218f9320a0..13dca4845c132b 100644
--- a/examples/modular-transformers/modular_roberta.py
+++ b/examples/modular-transformers/modular_roberta.py
@@ -13,8 +13,5 @@ def __init__(self, config):
class RobertaModel(BertModel):
- def __init__(self, config):
+ def __init__(self, config, add_pooling_layer=True):
super().__init__(self, config)
- # Error out here. Why? Because `RobertaEmbeddings` is defined but not used.
- # no, because it's defined, and RobertaModel should use RobertaEmbedding
- # here if initialized that way it won't use the new embedding.
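For readers new to the modular format referenced in the generated header earlier in this patch: a modular file only spells out the deltas from an existing model, and the converter expands it into a full standalone modeling file that CI keeps in sync. A minimal, hypothetical sketch of the pattern (not the repository file itself; the class bodies below are assumptions for illustration):

```python
# Hypothetical modular definition: only the differences from BERT are written out;
# a converter generates the standalone modeling file from this.
from transformers.models.bert.modeling_bert import BertEmbeddings, BertModel


class RobertaEmbeddings(BertEmbeddings):
    def __init__(self, config):
        super().__init__(config)
        # Assumed RoBERTa-style tweak; the real modular file defines its own embedding changes.
        self.padding_idx = config.pad_token_id


class RobertaModel(BertModel):
    # Inherits BERT's forward pass unchanged; the `add_pooling_layer` switch mirrors
    # the signature change made in this hunk.
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config, add_pooling_layer=add_pooling_layer)
```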
diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py
index 3bed494b75c608..8353333ef827ed 100644
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -141,10 +141,6 @@ class DataTrainingArguments:
default=None,
metadata={"help": "An optional input evaluation data file (a jsonlines file)."},
)
- test_file: Optional[str] = field(
- default=None,
- metadata={"help": "An optional input testing data file (a jsonlines file)."},
- )
max_seq_length: Optional[int] = field(
default=128,
metadata={
@@ -190,9 +186,6 @@ def __post_init__(self):
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
- if self.test_file is not None:
- extension = self.test_file.split(".")[-1]
- assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
dataset_name_mapping = {
@@ -315,9 +308,6 @@ def main():
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
- if data_args.test_file is not None:
- data_files["test"] = data_args.test_file
- extension = data_args.test_file.split(".")[-1]
dataset = load_dataset(
extension,
data_files=data_files,
@@ -387,8 +377,6 @@ def _freeze_params(module):
column_names = dataset["train"].column_names
elif training_args.do_eval:
column_names = dataset["validation"].column_names
- elif training_args.do_predict:
- column_names = dataset["test"].column_names
else:
logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
return
@@ -490,29 +478,6 @@ def filter_corrupt_images(examples):
# Transform images on the fly as doing it on the whole dataset takes too much time.
eval_dataset.set_transform(transform_images)
- if training_args.do_predict:
- if "test" not in dataset:
- raise ValueError("--do_predict requires a test dataset")
- test_dataset = dataset["test"]
- if data_args.max_eval_samples is not None:
- max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
- test_dataset = test_dataset.select(range(max_eval_samples))
-
- test_dataset = test_dataset.filter(
- filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
- )
- test_dataset = test_dataset.map(
- function=tokenize_captions,
- batched=True,
- num_proc=data_args.preprocessing_num_workers,
- remove_columns=[col for col in column_names if col != image_column],
- load_from_cache_file=not data_args.overwrite_cache,
- desc="Running tokenizer on test dataset",
- )
-
- # Transform images on the fly as doing it on the whole dataset takes too much time.
- test_dataset.set_transform(transform_images)
-
# 8. Initialize our trainer
trainer = Trainer(
model=model,
diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 2c60b359bd106b..a7193e23dddc39 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -331,7 +331,7 @@ def main():
config = AutoConfig.from_pretrained(
args.model_name_or_path,
num_labels=len(labels),
- i2label=id2label,
+ id2label=id2label,
label2id=label2id,
finetuning_task="image-classification",
trust_remote_code=args.trust_remote_code,
diff --git a/examples/pytorch/instance-segmentation/README.md b/examples/pytorch/instance-segmentation/README.md
index 72eb5a5befb4fb..339d7591523de7 100644
--- a/examples/pytorch/instance-segmentation/README.md
+++ b/examples/pytorch/instance-segmentation/README.md
@@ -148,7 +148,7 @@ with torch.no_grad():
outputs = model(**inputs)
# Post-process outputs
-outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[image.size[::-1]])
+outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[(image.height, image.width)])
print("Mask shape: ", outputs[0]["segmentation"].shape)
print("Mask values: ", outputs[0]["segmentation"].unique())
diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt
index a54f3d03cab21b..33bb1d658595e5 100644
--- a/examples/research_projects/decision_transformer/requirements.txt
+++ b/examples/research_projects/decision_transformer/requirements.txt
@@ -1,5 +1,5 @@
absl-py==1.0.0
-aiohttp==3.10.2
+aiohttp==3.10.11
aiosignal==1.2.0
alembic==1.7.7
appdirs==1.4.4
@@ -233,7 +233,7 @@ urllib3==1.26.19
wasabi==0.9.0
wcwidth==0.2.5
websocket-client==1.3.1
-Werkzeug==3.0.3
+Werkzeug==3.0.6
wrapt==1.14.0
xxhash==3.0.0
yarl==1.7.2
diff --git a/i18n/README_ko.md b/i18n/README_ko.md
index e2a9b80d0d3ecc..b9502db5dda845 100644
--- a/i18n/README_ko.md
+++ b/i18n/README_ko.md
@@ -15,10 +15,15 @@ limitations under the License.
-->
-🤗 Transformers는 분류, 정보 추출, 질문 답변, 요약, 번역, 문장 생성 등을 100개 이상의 언어로 수행할 수 있는 수천개의 사전학습된 모델을 제공합니다. 우리의 목표는 모두가 최첨단의 NLP 기술을 쉽게 사용하는 것입니다.
+🤗 Transformers는 텍스트, 비전, 오디오와 같은 다양한 분야에서 여러 과제를 수행하는 수천 개의 사전 학습된 모델을 제공합니다.
+
+제공되는 모델을 통해 다음 과제를 수행할 수 있습니다.
+- 📝 텍스트: 100개 이상의 언어들로, 텍스트 분류, 정보 추출, 질문 답변, 요약, 번역 및 문장 생성
+- 🖼️ 이미지: 이미지 분류(Image Classification), 객체 탐지(Object Detection) 및 분할(Segmentation)
+- 🗣️ 오디오: 음성 인식(Speech Recognition) 및 오디오 분류(Audio Classification)
+
+Transformer의 모델은 표를 통한 질의응답(Table QA), 광학 문자 인식(Optical Character Recognition), 스캔 한 문서에서 정보 추출, 비디오 분류 및 시각적 질의응답과 같은 **여러 분야가 결합된** 과제 또한 수행할 수 있습니다.
🤗 Transformers는 이러한 사전학습 모델을 빠르게 다운로드해 특정 텍스트에 사용하고, 원하는 데이터로 fine-tuning해 커뮤니티나 우리의 [모델 허브](https://huggingface.co/models)에 공유할 수 있도록 API를 제공합니다. 또한, 모델 구조를 정의하는 각 파이썬 모듈은 완전히 독립적이여서 연구 실험을 위해 손쉽게 수정할 수 있습니다.
@@ -65,9 +76,11 @@ limitations under the License.
## 온라인 데모
-대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다.
+대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해 볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다.
-예시:
+아래 몇 가지 예시가 있습니다:
+
+자연어 처리:
- [BERT로 마스킹된 단어 완성하기](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Electra를 이용한 개체명 인식](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [GPT-2로 텍스트 생성하기](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+)
@@ -76,45 +89,100 @@ limitations under the License.
- [DistilBERT를 이용한 질문 답변](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [T5로 번역하기](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+컴퓨터 비전:
+- [ViT와 함께하는 이미지 분류](https://huggingface.co/google/vit-base-patch16-224)
+- [DETR로 객체 탐지하기](https://huggingface.co/facebook/detr-resnet-50)
+- [SegFormer로 의미적 분할(semantic segmentation)하기](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [Mask2Former로 판옵틱 분할(panoptic segmentation)하기](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [Depth Anything으로 깊이 추정(depth estimation)하기](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [VideoMAE와 함께하는 비디오 분류](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [OneFormer로 유니버설 분할(universal segmentation)하기](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+오디오:
+- [Whisper와 함께하는 자동 음성 인식](https://huggingface.co/openai/whisper-large-v3)
+- [Wav2Vec2로 키워드 검출(keyword spotting)하기](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [Audio Spectrogram Transformer로 오디오 분류하기](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+멀티 모달(Multimodal Task):
+- [TAPAS로 표 안에서 질문 답변하기](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [ViLT와 함께하는 시각적 질의응답](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [LLaVa로 이미지에 설명 넣기](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [SigLIP와 함께하는 제로 샷(zero-shot) 이미지 분류](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [LayoutLM으로 문서 안에서 질문 답변하기](https://huggingface.co/impira/layoutlm-document-qa)
+- [X-CLIP과 함께하는 제로 샷(zero-shot) 비디오 분류](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [OWLv2로 진행하는 제로 샷(zero-shot) 객체 탐지](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [CLIPSeg로 진행하는 제로 샷(zero-shot) 이미지 분할](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [SAM과 함께하는 자동 마스크 생성](https://huggingface.co/docs/transformers/model_doc/sam)
+
**[Transformer와 글쓰기](https://transformer.huggingface.co)** 는 이 저장소의 텍스트 생성 능력에 관한 Hugging Face 팀의 공식 데모입니다.
-## Hugging Face 팀의 커스텀 지원을 원한다면
+## Transformers를 사용한 100개의 프로젝트
+
+Transformers는 사전 학습된 모델들을 이용하는 도구를 넘어 Transformers와 함께 빌드 된 프로젝트 및 Hugging Face Hub를 위한 하나의 커뮤니티입니다. 우리는 Transformers를 통해 개발자, 연구자, 학생, 교수, 엔지니어 및 모든 이들이 꿈을 품은 프로젝트(Dream Project)를 빌드 할 수 있길 바랍니다.
-
-
+Transformers에 달린 100,000개의 별을 축하하기 위해, 우리는 커뮤니티를 주목하고자 Transformers를 품고 빌드 된 100개의 어마어마한 프로젝트들을 선별하여 [awesome-transformers](https://github.com/huggingface/transformers/blob/main/awesome-transformers.md) 페이지에 나열하였습니다.
+
+만일 소유한 혹은 사용하고 계신 프로젝트가 이 리스트에 꼭 등재되어야 한다고 믿으신다면, PR을 열고 추가하여 주세요!
+
+## 조직 안에서 AI 사용에 대해 진지하게 고민 중이신가요? Hugging Face Enterprise Hub을 통해 더 빨리 구축해 보세요.
+
+
+
## 퀵 투어
-원하는 텍스트에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다:
+주어진 입력(텍스트, 이미지, 오디오, ...)에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다:
```python
>>> from transformers import pipeline
-# Allocate a pipeline for sentiment-analysis
+# 감정 분석 파이프라인을 할당하세요
>>> classifier = pipeline('sentiment-analysis')
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```
-코드의 두번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다.
+코드의 두 번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세 번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다.
-많은 NLP 과제들을 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 질문과 문맥이 주어지면 손쉽게 답변을 추출할 수 있습니다:
+자연어 처리(NLP) 뿐만 아니라 컴퓨터 비전, 발화(Speech) 과제들을 사전 학습된 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 사진에서 손쉽게 객체들을 탐지할 수 있습니다.:
``` python
+>>> import requests
+>>> from PIL import Image
>>> from transformers import pipeline
-# Allocate a pipeline for question-answering
->>> question_answerer = pipeline('question-answering')
->>> question_answerer({
-... 'question': 'What is the name of the repository ?',
-... 'context': 'Pipeline has been included in the huggingface/transformers repository'
-... })
-{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
-
+# 귀여운 고양이가 있는 이미지를 다운로드하세요
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# 객체 감지를 위한 파이프라인을 할당하세요
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+ 'label': 'remote',
+ 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+ 'label': 'remote',
+ 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+ 'label': 'couch',
+ 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+ 'label': 'cat',
+ 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+ 'label': 'cat',
+ 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```
+위와 같이, 우리는 이미지에서 탐지된 객체들에 대하여 객체를 감싸는 박스와 확률 리스트를 얻을 수 있습니다. 왼쪽이 원본 이미지이며 오른쪽은 해당 이미지에 탐지된 결과를 표시하였습니다.
+
+
+
+
-답변뿐만 아니라, 여기에 사용된 사전학습 모델은 확신도와 토크나이즈된 문장 속 답변의 시작점, 끝점까지 반환합니다. [이 튜토리얼](https://huggingface.co/docs/transformers/task_summary)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다.
+[이 튜토리얼](https://huggingface.co/docs/transformers/ko/task_summary)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다.
코드 3줄로 원하는 과제에 맞게 사전학습 모델을 다운로드 받고 사용할 수 있습니다. 다음은 PyTorch 버전입니다:
```python
@@ -139,24 +207,24 @@ limitations under the License.
토크나이저는 사전학습 모델의 모든 전처리를 책임집니다. 그리고 (위의 예시처럼) 1개의 스트링이나 리스트도 처리할 수 있습니다. 토크나이저는 딕셔너리를 반환하는데, 이는 다운스트림 코드에 사용하거나 언패킹 연산자 ** 를 이용해 모델에 바로 전달할 수도 있습니다.
-모델 자체는 일반적으로 사용되는 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/transformers/training.html)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 fine-tune하기 위해 `Trainer` API를 사용하는 방법을 설명해줍니다.
+모델 자체는 일반적으로 사용되는 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)이나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/docs/transformers/ko/training)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 파인 튜닝하기 위해 `Trainer` API를 사용하는 방법을 설명해 줍니다.
## 왜 transformers를 사용해야 할까요?
1. 손쉽게 사용할 수 있는 최첨단 모델:
- - NLU와 NLG 과제에서 뛰어난 성능을 보입니다.
- - 교육자 실무자에게 진입 장벽이 낮습니다.
+ - 자연어 이해(NLU)와 생성(NLG), 컴퓨터 비전, 오디오 과제에서 뛰어난 성능을 보입니다.
+ - 교육자와 실무자에게 진입 장벽이 낮습니다.
- 3개의 클래스만 배우면 바로 사용할 수 있습니다.
- 하나의 API로 모든 사전학습 모델을 사용할 수 있습니다.
1. 더 적은 계산 비용, 더 적은 탄소 발자국:
- 연구자들은 모델을 계속 다시 학습시키는 대신 학습된 모델을 공유할 수 있습니다.
- 실무자들은 학습에 필요한 시간과 비용을 절약할 수 있습니다.
- - 수십개의 모델 구조, 2,000개 이상의 사전학습 모델, 100개 이상의 언어로 학습된 모델 등.
+ - 모든 분야를 통틀어서 400,000개 이상의 사전 학습된 모델이 있는 수십 개의 아키텍처.
1. 모델의 각 생애주기에 적합한 프레임워크:
- 코드 3줄로 최첨단 모델을 학습하세요.
- - 자유롭게 모델을 TF2.0나 PyTorch 프레임워크로 변환하세요.
+ - 목적에 알맞게 모델을 TF2.0/Pytorch/Jax 프레임 워크 중 하나로 이동시키세요.
- 학습, 평가, 공개 등 각 단계에 맞는 프레임워크를 원하는대로 선택하세요.
1. 필요한 대로 모델이나 예시를 커스터마이즈하세요:
@@ -167,14 +235,14 @@ limitations under the License.
## 왜 transformers를 사용하지 말아야 할까요?
- 이 라이브러리는 신경망 블록을 만들기 위한 모듈이 아닙니다. 연구자들이 여러 파일을 살펴보지 않고 바로 각 모델을 사용할 수 있도록, 모델 파일 코드의 추상화 수준을 적정하게 유지했습니다.
-- 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요.
+- 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요(예를 들면, [Accelerate](https://huggingface.co/docs/accelerate/index)).
- 가능한 많은 사용 예시를 보여드리고 싶어서, [예시 폴더](https://github.com/huggingface/transformers/tree/main/examples)의 스크립트를 준비했습니다. 이 스크립트들을 수정 없이 특정한 문제에 바로 적용하지 못할 수 있습니다. 필요에 맞게 일부 코드를 수정해야 할 수 있습니다.
## 설치
### pip로 설치하기
-이 저장소는 Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+에서 테스트 되었습니다.
+이 저장소는 Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+에서 테스트 되었습니다.
[가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요.
@@ -189,7 +257,7 @@ limitations under the License.
pip install transformers
```
-예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/docs/transformers/installation#installing-from-source)하셔야 합니다.
+예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/docs/transformers/ko/installation#install-from-source)하셔야 합니다.
### conda로 설치하기
@@ -203,29 +271,30 @@ conda install conda-forge::transformers
Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 방법을 확인하세요.
+> **_노트:_** 윈도우 환경에서 캐싱의 이점을 위해 개발자 모드를 활성화할 수 있습니다. 만약 여러분에게 있어서 선택이 아닌 필수라면 [이 이슈](https://github.com/huggingface/huggingface_hub/issues/1062)를 통해 알려주세요.
+
## 모델 구조
-**🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co)에 완벽히 연동되어 있습니다. [개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다.
+**🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co/models)에 완벽히 연동되어 있습니다. [개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다.
현재 사용 가능한 모델 체크포인트의 개수: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers는 다음 모델들을 제공합니다: 각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요.
+🤗 Transformers는 다음 모델들을 제공합니다: 각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/ko/model_summary)서 확인하세요.
-각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요.
+각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/ko/index#supported-framework)를 확인하세요.
-이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://huggingface.co/docs/transformers/examples)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다.
+이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://github.com/huggingface/transformers/tree/main/examples)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다.
## 더 알아보기
| 섹션 | 설명 |
|-|-|
-| [도큐먼트](https://huggingface.co/transformers/) | 전체 API 도큐먼트와 튜토리얼 |
-| [과제 요약](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers가 지원하는 과제들 |
-| [전처리 튜토리얼](https://huggingface.co/docs/transformers/preprocessing) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 |
-| [학습과 fine-tuning](https://huggingface.co/docs/transformers/training) | 🤗 Transformers가 제공하는 모델 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 |
-| [퀵 투어: Fine-tuning/사용 스크립트](https://github.com/huggingface/transformers/tree/main/examples) | 다양한 과제에서 모델 fine-tuning하는 예시 스크립트 |
-| [모델 공유 및 업로드](https://huggingface.co/docs/transformers/model_sharing) | 커뮤니티에 fine-tune된 모델을 업로드 및 공유하기 |
-| [마이그레이션](https://huggingface.co/docs/transformers/migration) | `pytorch-transformers`나 `pytorch-pretrained-bert`에서 🤗 Transformers로 이동하기|
+| [도큐먼트](https://huggingface.co/transformers/ko/) | 전체 API 도큐먼트와 튜토리얼 |
+| [과제 요약](https://huggingface.co/docs/transformers/ko/task_summary) | 🤗 Transformers가 지원하는 과제들 |
+| [전처리 튜토리얼](https://huggingface.co/docs/transformers/ko/preprocessing) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 |
+| [학습과 파인 튜닝](https://huggingface.co/docs/transformers/ko/training) | 🤗 Transformers가 제공하는 모델 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 |
+| [퀵 투어: 파인 튜닝/사용 스크립트](https://github.com/huggingface/transformers/tree/main/examples) | 다양한 과제에서 모델을 파인 튜닝하는 예시 스크립트 |
+| [모델 공유 및 업로드](https://huggingface.co/docs/transformers/ko/model_sharing) | 커뮤니티에 파인 튜닝된 모델을 업로드 및 공유하기 |
## 인용
diff --git a/setup.py b/setup.py
index cbfcfd43428524..922258d65efab7 100644
--- a/setup.py
+++ b/setup.py
@@ -117,7 +117,7 @@
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
- "huggingface-hub>=0.23.2,<1.0",
+ "huggingface-hub>=0.24.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index cc8b07395024a8..e56959928b4f41 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -620,6 +620,7 @@
"models.nougat": ["NougatProcessor"],
"models.nystromformer": ["NystromformerConfig"],
"models.olmo": ["OlmoConfig"],
+ "models.olmo_1124": ["Olmo1124Config"],
"models.olmoe": ["OlmoeConfig"],
"models.omdet_turbo": [
"OmDetTurboConfig",
@@ -868,6 +869,7 @@
"ImageClassificationPipeline",
"ImageFeatureExtractionPipeline",
"ImageSegmentationPipeline",
+ "ImageTextToTextPipeline",
"ImageToImagePipeline",
"ImageToTextPipeline",
"JsonPipelineDataFormat",
@@ -1184,7 +1186,7 @@
)
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
_import_structure["models.deformable_detr"].extend(
- ["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
+ ["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast"]
)
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
_import_structure["models.deprecated.deta"].append("DetaImageProcessor")
@@ -1228,7 +1230,7 @@
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
_import_structure["models.pvt"].extend(["PvtImageProcessor"])
_import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
- _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"])
+ _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor", "RTDetrImageProcessorFast"])
_import_structure["models.sam"].extend(["SamImageProcessor"])
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
_import_structure["models.seggpt"].extend(["SegGptImageProcessor"])
@@ -2918,6 +2920,13 @@
"OlmoPreTrainedModel",
]
)
+ _import_structure["models.olmo_1124"].extend(
+ [
+ "Olmo1124ForCausalLM",
+ "Olmo1124Model",
+ "Olmo1124PreTrainedModel",
+ ]
+ )
_import_structure["models.olmoe"].extend(
[
"OlmoeForCausalLM",
@@ -5505,6 +5514,7 @@
NystromformerConfig,
)
from .models.olmo import OlmoConfig
+ from .models.olmo_1124 import Olmo1124Config
from .models.olmoe import OlmoeConfig
from .models.omdet_turbo import (
OmDetTurboConfig,
@@ -5794,6 +5804,7 @@
ImageClassificationPipeline,
ImageFeatureExtractionPipeline,
ImageSegmentationPipeline,
+ ImageTextToTextPipeline,
ImageToImagePipeline,
ImageToTextPipeline,
JsonPipelineDataFormat,
@@ -6089,6 +6100,7 @@
from .models.deformable_detr import (
DeformableDetrFeatureExtractor,
DeformableDetrImageProcessor,
+ DeformableDetrImageProcessorFast,
)
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
from .models.deprecated.deta import DetaImageProcessor
@@ -6152,7 +6164,7 @@
)
from .models.pvt import PvtImageProcessor
from .models.qwen2_vl import Qwen2VLImageProcessor
- from .models.rt_detr import RTDetrImageProcessor
+ from .models.rt_detr import RTDetrImageProcessor, RTDetrImageProcessorFast
from .models.sam import SamImageProcessor
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
from .models.seggpt import SegGptImageProcessor
@@ -7521,6 +7533,11 @@
OlmoModel,
OlmoPreTrainedModel,
)
+ from .models.olmo_1124 import (
+ Olmo1124ForCausalLM,
+ Olmo1124Model,
+ Olmo1124PreTrainedModel,
+ )
from .models.olmoe import (
OlmoeForCausalLM,
OlmoeModel,
diff --git a/src/transformers/agents/agents.py b/src/transformers/agents/agents.py
index 73b7186d25a3c7..c461c50f29592c 100644
--- a/src/transformers/agents/agents.py
+++ b/src/transformers/agents/agents.py
@@ -1141,11 +1141,10 @@ def step(self):
)
self.logger.warning("Print outputs:")
self.logger.log(32, self.state["print_outputs"])
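+ # Build the observation from the print outputs even when the code snippet returns no result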
+ observation = "Print outputs:\n" + self.state["print_outputs"]
if result is not None:
self.logger.warning("Last output from code snippet:")
self.logger.log(32, str(result))
- observation = "Print outputs:\n" + self.state["print_outputs"]
- if result is not None:
observation += "Last output from code snippet:\n" + str(result)[:100000]
current_step_logs["observation"] = observation
except Exception as e:
diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index 5c36c2922fa2a1..456c6172a77cb0 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -68,25 +68,84 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
class HfApiEngine:
- """This engine leverages Hugging Face's Inference API service, either serverless or with a dedicated endpoint."""
+ """A class to interact with Hugging Face's Inference API for language model interaction.
+
+ This engine allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
+
+ Parameters:
+ model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3.1-8B-Instruct"`):
+ The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub.
+ token (`str`, *optional*):
+ The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration.
+ max_tokens (`int`, *optional*, defaults to 1500):
+ The maximum number of tokens allowed in the output.
+ timeout (`int`, *optional*, defaults to 120):
+ Timeout for the API request, in seconds.
+
+ Raises:
+ ValueError:
+ If the model name is not provided.
+ """
+
+ def __init__(
+ self,
+ model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ token: Optional[str] = None,
+ max_tokens: Optional[int] = 1500,
+ timeout: Optional[int] = 120,
+ ):
+ """Initialize the HfApiEngine."""
+ if not model:
+ raise ValueError("Model name must be provided.")
- def __init__(self, model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"):
self.model = model
- self.client = InferenceClient(self.model, timeout=120)
+ self.client = InferenceClient(self.model, token=token, timeout=timeout)
+ self.max_tokens = max_tokens
def __call__(
- self, messages: List[Dict[str, str]], stop_sequences: List[str] = [], grammar: Optional[str] = None
+ self,
+ messages: List[Dict[str, str]],
+ stop_sequences: Optional[List[str]] = [],
+ grammar: Optional[str] = None,
) -> str:
+ """Process the input messages and return the model's response.
+
+ This method sends a list of messages to the Hugging Face Inference API, optionally with stop sequences and grammar customization.
+
+ Parameters:
+ messages (`List[Dict[str, str]]`):
+ A list of message dictionaries to be processed. Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`.
+ stop_sequences (`List[str]`, *optional*):
+ A list of strings that will stop the generation if encountered in the model's output.
+ grammar (`str`, *optional*):
+ The grammar or formatting structure to use in the model's response.
+
+ Returns:
+ `str`: The text content of the model's response.
+
+ Example:
+ ```python
+ >>> engine = HfApiEngine(
+ ... model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+ ... token="your_hf_token_here",
+ ... max_tokens=2000
+ ... )
+ >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
+ >>> response = engine(messages, stop_sequences=["END"])
+ >>> print(response)
+ "Quantum mechanics is the branch of physics that studies..."
+ ```
+ """
# Get clean message list
messages = get_clean_message_list(messages, role_conversions=llama_role_conversions)
- # Get LLM output
+ # Send messages to the Hugging Face Inference API
if grammar is not None:
response = self.client.chat_completion(
- messages, stop=stop_sequences, max_tokens=1500, response_format=grammar
+ messages, stop=stop_sequences, max_tokens=self.max_tokens, response_format=grammar
)
else:
- response = self.client.chat_completion(messages, stop=stop_sequences, max_tokens=1500)
+ response = self.client.chat_completion(messages, stop=stop_sequences, max_tokens=self.max_tokens)
response = response.choices[0].message.content
diff --git a/src/transformers/agents/monitoring.py b/src/transformers/agents/monitoring.py
index 8e28a72deb2a3e..755418d35a56a3 100644
--- a/src/transformers/agents/monitoring.py
+++ b/src/transformers/agents/monitoring.py
@@ -18,11 +18,19 @@
from .agents import ReactAgent
-def pull_message(step_log: dict):
+def pull_message(step_log: dict, test_mode: bool = True):
try:
from gradio import ChatMessage
except ImportError:
- raise ImportError("Gradio should be installed in order to launch a gradio demo.")
+ if test_mode:
+
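+ # Minimal stand-in for gradio.ChatMessage so this helper can run in tests without gradio installed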
+ class ChatMessage:
+ def __init__(self, role, content, metadata=None):
+ self.role = role
+ self.content = content
+ self.metadata = metadata
+ else:
+ raise ImportError("Gradio should be installed in order to launch a gradio demo.")
if step_log.get("rationale"):
yield ChatMessage(role="assistant", content=step_log["rationale"])
@@ -46,30 +54,40 @@ def pull_message(step_log: dict):
)
-def stream_to_gradio(agent: ReactAgent, task: str, **kwargs):
+def stream_to_gradio(agent: ReactAgent, task: str, test_mode: bool = False, **kwargs):
"""Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
try:
from gradio import ChatMessage
except ImportError:
- raise ImportError("Gradio should be installed in order to launch a gradio demo.")
+ if test_mode:
+
+ class ChatMessage:
+ def __init__(self, role, content, metadata=None):
+ self.role = role
+ self.content = content
+ self.metadata = metadata
+ else:
+ raise ImportError("Gradio should be installed in order to launch a gradio demo.")
for step_log in agent.run(task, stream=True, **kwargs):
if isinstance(step_log, dict):
- for message in pull_message(step_log):
+ for message in pull_message(step_log, test_mode=test_mode):
yield message
- if isinstance(step_log, AgentText):
- yield ChatMessage(role="assistant", content=f"**Final answer:**\n```\n{step_log.to_string()}\n```")
- elif isinstance(step_log, AgentImage):
+ final_answer = step_log # Last log is the run's final_answer
+
+ if isinstance(final_answer, AgentText):
+ yield ChatMessage(role="assistant", content=f"**Final answer:**\n```\n{final_answer.to_string()}\n```")
+ elif isinstance(final_answer, AgentImage):
yield ChatMessage(
role="assistant",
- content={"path": step_log.to_string(), "mime_type": "image/png"},
+ content={"path": final_answer.to_string(), "mime_type": "image/png"},
)
- elif isinstance(step_log, AgentAudio):
+ elif isinstance(final_answer, AgentAudio):
yield ChatMessage(
role="assistant",
- content={"path": step_log.to_string(), "mime_type": "audio/wav"},
+ content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
)
else:
- yield ChatMessage(role="assistant", content=str(step_log))
+ yield ChatMessage(role="assistant", content=str(final_answer))
diff --git a/src/transformers/agents/python_interpreter.py b/src/transformers/agents/python_interpreter.py
index fbece2bebd350f..6e90f356cb928e 100644
--- a/src/transformers/agents/python_interpreter.py
+++ b/src/transformers/agents/python_interpreter.py
@@ -848,6 +848,13 @@ def evaluate_ast(
raise InterpreterError(f"{expression.__class__.__name__} is not supported.")
+def truncate_print_outputs(print_outputs: str, max_len_outputs: int = MAX_LEN_OUTPUT) -> str:
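+ """Return the captured prints unchanged when under `max_len_outputs`, otherwise a truncated block with a notice."""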
+ if len(print_outputs) < max_len_outputs:
+ return print_outputs
+ else:
+ return f"Print outputs:\n{print_outputs[:max_len_outputs]}\n_Print outputs have been truncated over the limit of {max_len_outputs} characters._\n"
+
+
def evaluate_python_code(
code: str,
static_tools: Optional[Dict[str, Callable]] = None,
@@ -890,25 +897,12 @@ def evaluate_python_code(
PRINT_OUTPUTS = ""
global OPERATIONS_COUNT
OPERATIONS_COUNT = 0
- for node in expression.body:
- try:
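+ # Wrap the whole evaluation loop in one try/except so the success and failure paths share `truncate_print_outputs`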
+ try:
+ for node in expression.body:
result = evaluate_ast(node, state, static_tools, custom_tools, authorized_imports)
- except InterpreterError as e:
- msg = ""
- if len(PRINT_OUTPUTS) > 0:
- if len(PRINT_OUTPUTS) < MAX_LEN_OUTPUT:
- msg += f"Print outputs:\n{PRINT_OUTPUTS}\n====\n"
- else:
- msg += f"Print outputs:\n{PRINT_OUTPUTS[:MAX_LEN_OUTPUT]}\n_Print outputs were over {MAX_LEN_OUTPUT} characters, so they have been truncated._\n====\n"
- msg += f"EXECUTION FAILED:\nEvaluation stopped at line '{ast.get_source_segment(code, node)}' because of the following error:\n{e}"
- raise InterpreterError(msg)
- finally:
- if len(PRINT_OUTPUTS) < MAX_LEN_OUTPUT:
- state["print_outputs"] = PRINT_OUTPUTS
- else:
- state["print_outputs"] = (
- PRINT_OUTPUTS[:MAX_LEN_OUTPUT]
- + f"\n_Print outputs were over {MAX_LEN_OUTPUT} characters, so they have been truncated._"
- )
-
- return result
+ state["print_outputs"] = truncate_print_outputs(PRINT_OUTPUTS, max_len_outputs=MAX_LEN_OUTPUT)
+ return result
+ except InterpreterError as e:
+ msg = truncate_print_outputs(PRINT_OUTPUTS, max_len_outputs=MAX_LEN_OUTPUT)
+ msg += f"EXECUTION FAILED:\nEvaluation stopped at line '{ast.get_source_segment(code, node)}' because of the following error:\n{e}"
+ raise InterpreterError(msg)
diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py
index a425ffc8f106b2..6d3401bf30e94a 100644
--- a/src/transformers/agents/tools.py
+++ b/src/transformers/agents/tools.py
@@ -14,6 +14,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import ast
import base64
import importlib
import inspect
@@ -22,6 +23,7 @@
import os
import tempfile
from functools import lru_cache, wraps
+from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder
@@ -44,7 +46,7 @@
is_vision_available,
logging,
)
-from .agent_types import handle_agent_inputs, handle_agent_outputs
+from .agent_types import ImageType, handle_agent_inputs, handle_agent_outputs
logger = logging.get_logger(__name__)
@@ -87,20 +89,22 @@ def get_repo_type(repo_id, repo_type=None, **hub_kwargs):
"""
-def validate_after_init(cls):
+def validate_after_init(cls, do_validate_forward: bool = True):
original_init = cls.__init__
@wraps(original_init)
def new_init(self, *args, **kwargs):
original_init(self, *args, **kwargs)
if not isinstance(self, PipelineTool):
- self.validate_arguments()
+ self.validate_arguments(do_validate_forward=do_validate_forward)
cls.__init__ = new_init
return cls
-@validate_after_init
+CONVERSION_DICT = {"str": "string", "int": "integer", "float": "number"}
+
+
class Tool:
"""
A base class for the functions used by the agent. Subclass this and implement the `__call__` method as well as the
@@ -131,35 +135,45 @@ class Tool:
def __init__(self, *args, **kwargs):
self.is_initialized = False
- def validate_arguments(self):
+ def __init_subclass__(cls, **kwargs):
+ super().__init_subclass__(**kwargs)
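+ # Run `validate_arguments` right after each subclass's __init__; the forward-signature check is skipped at this stage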
+ validate_after_init(cls, do_validate_forward=False)
+
+ def validate_arguments(self, do_validate_forward: bool = True):
required_attributes = {
"description": str,
"name": str,
- "inputs": Dict,
+ "inputs": dict,
"output_type": str,
}
authorized_types = ["string", "integer", "number", "image", "audio", "any", "boolean"]
for attr, expected_type in required_attributes.items():
attr_value = getattr(self, attr, None)
+ if attr_value is None:
+ raise TypeError(f"You must set an attribute {attr}.")
if not isinstance(attr_value, expected_type):
- raise TypeError(f"You must set an attribute {attr} of type {expected_type.__name__}.")
+ raise TypeError(
+ f"Attribute {attr} should have type {expected_type.__name__}, got {type(attr_value)} instead."
+ )
for input_name, input_content in self.inputs.items():
- assert "type" in input_content, f"Input '{input_name}' should specify a type."
+ assert isinstance(input_content, dict), f"Input '{input_name}' should be a dictionary."
+ assert (
+ "type" in input_content and "description" in input_content
+ ), f"Input '{input_name}' should have keys 'type' and 'description', has only {list(input_content.keys())}."
if input_content["type"] not in authorized_types:
raise Exception(
f"Input '{input_name}': type '{input_content['type']}' is not an authorized value, should be one of {authorized_types}."
)
- assert "description" in input_content, f"Input '{input_name}' should have a description."
assert getattr(self, "output_type", None) in authorized_types
-
- if not isinstance(self, PipelineTool):
- signature = inspect.signature(self.forward)
- if not set(signature.parameters.keys()) == set(self.inputs.keys()):
- raise Exception(
- "Tool's 'forward' method should take 'self' as its first argument, then its next arguments should match the keys of tool attribute 'inputs'."
- )
+ if do_validate_forward:
+ if not isinstance(self, PipelineTool):
+ signature = inspect.signature(self.forward)
+ if not set(signature.parameters.keys()) == set(self.inputs.keys()):
+ raise Exception(
+ "Tool's 'forward' method should take 'self' as its first argument, then its next arguments should match the keys of tool attribute 'inputs'."
+ )
def forward(self, *args, **kwargs):
return NotImplemented("Write this method in your subclass of `Tool`.")
@@ -240,7 +254,6 @@ def save(self, output_dir):
def from_hub(
cls,
repo_id: str,
- model_repo_id: Optional[str] = None,
token: Optional[str] = None,
**kwargs,
):
@@ -258,9 +271,6 @@ def from_hub(
Args:
repo_id (`str`):
The name of the repo on the Hub where your tool is defined.
- model_repo_id (`str`, *optional*):
- If your tool uses a model and you want to use a different model than the default, you can pass a second
- repo ID or an endpoint url to this argument.
token (`str`, *optional*):
The token to identify you on hf.co. If unset, will use the token generated when running
`huggingface-cli login` (stored in `~/.huggingface`).
@@ -346,6 +356,9 @@ def from_hub(
if tool_class.output_type != custom_tool["output_type"]:
tool_class.output_type = custom_tool["output_type"]
+ if not isinstance(tool_class.inputs, dict):
+ tool_class.inputs = ast.literal_eval(tool_class.inputs)
+
return tool_class(**kwargs)
def push_to_hub(
@@ -405,6 +418,122 @@ def push_to_hub(
repo_type="space",
)
+ @staticmethod
+ def from_space(
+ space_id: str, name: str, description: str, api_name: Optional[str] = None, token: Optional[str] = None
+ ):
+ """
+ Creates a [`Tool`] from a Space given its id on the Hub.
+
+ Args:
+ space_id (`str`):
+ The id of the Space on the Hub.
+ name (`str`):
+ The name of the tool.
+ description (`str`):
+ The description of the tool.
+ api_name (`str`, *optional*):
+ The specific api_name to use, if the space has several tabs. If not specified, it will default to the first available API.
+ token (`str`, *optional*):
+ Add your token to access private spaces or increase your GPU quotas.
+ Returns:
+ [`Tool`]:
+ The Space, as a tool.
+
+ Examples:
+ ```
+ image_generator = Tool.from_space(
+ space_id="black-forest-labs/FLUX.1-schnell",
+ name="image-generator",
+ description="Generate an image from a prompt"
+ )
+ image = image_generator("Generate an image of a cool surfer in Tahiti")
+ ```
+ ```
+ face_swapper = Tool.from_space(
+ "tuan2308/face-swap",
+ "face_swapper",
+ "Tool that puts the face shown on the first image on the second image. You can give it paths to images.",
+ )
+ image = face_swapper('./aymeric.jpeg', './ruth.jpg')
+ ```
+ """
+ from gradio_client import Client, handle_file
+ from gradio_client.utils import is_http_url_like
+
+ class SpaceToolWrapper(Tool):
+ def __init__(
+ self,
+ space_id: str,
+ name: str,
+ description: str,
+ api_name: Optional[str] = None,
+ token: Optional[str] = None,
+ ):
+ self.client = Client(space_id, hf_token=token)
+ self.name = name
+ self.description = description
+ space_description = self.client.view_api(return_format="dict", print_info=False)["named_endpoints"]
+
+ # If api_name is not defined, take the first of the available APIs for this space
+ if api_name is None:
+ api_name = list(space_description.keys())[0]
+ logger.warning(
+ f"Since `api_name` was not defined, it was automatically set to the first avilable API: `{api_name}`."
+ )
+ self.api_name = api_name
+
+ try:
+ space_description_api = space_description[api_name]
+ except KeyError:
+ raise KeyError(f"Could not find specified {api_name=} among available api names.")
+
+ self.inputs = {}
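+ # Only Space parameters without a default value become required tool inputs; "object" types are relaxed to "any"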
+ for parameter in space_description_api["parameters"]:
+ if not parameter["parameter_has_default"]:
+ parameter_type = parameter["type"]["type"]
+ if parameter_type == "object":
+ parameter_type = "any"
+ self.inputs[parameter["parameter_name"]] = {
+ "type": parameter_type,
+ "description": parameter["python_type"]["description"],
+ }
+ output_component = space_description_api["returns"][0]["component"]
+ if output_component == "Image":
+ self.output_type = "image"
+ elif output_component == "Audio":
+ self.output_type = "audio"
+ else:
+ self.output_type = "any"
+
+ def sanitize_argument_for_prediction(self, arg):
+ if isinstance(arg, ImageType):
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+ arg.save(temp_file.name)
+ arg = temp_file.name
+ if (isinstance(arg, (str, Path)) and Path(arg).exists() and Path(arg).is_file()) or is_http_url_like(
+ arg
+ ):
+ arg = handle_file(arg)
+ return arg
+
+ def forward(self, *args, **kwargs):
+ # Preprocess args and kwargs:
+ args = list(args)
+ for i, arg in enumerate(args):
+ args[i] = self.sanitize_argument_for_prediction(arg)
+ for arg_name, arg in kwargs.items():
+ kwargs[arg_name] = self.sanitize_argument_for_prediction(arg)
+
+ output = self.client.predict(*args, api_name=self.api_name, **kwargs)
+ if isinstance(output, (tuple, list)):
+ # Sometimes the space also returns the generation seed, in which case the result is at index 0
+ return output[0]
+ return output
+
+ return SpaceToolWrapper(space_id, name, description, api_name=api_name, token=token)
+
@staticmethod
def from_gradio(gradio_tool):
"""
@@ -414,16 +543,15 @@ def from_gradio(gradio_tool):
class GradioToolWrapper(Tool):
def __init__(self, _gradio_tool):
- super().__init__()
self.name = _gradio_tool.name
self.description = _gradio_tool.description
self.output_type = "string"
self._gradio_tool = _gradio_tool
- func_args = list(inspect.signature(_gradio_tool.run).parameters.keys())
- self.inputs = {key: "" for key in func_args}
-
- def forward(self, *args, **kwargs):
- return self._gradio_tool.run(*args, **kwargs)
+ func_args = list(inspect.signature(_gradio_tool.run).parameters.items())
+ self.inputs = {
+ key: {"type": CONVERSION_DICT[value.annotation], "description": ""} for key, value in func_args
+ }
+ self.forward = self._gradio_tool.run
return GradioToolWrapper(gradio_tool)
@@ -435,10 +563,13 @@ def from_langchain(langchain_tool):
class LangChainToolWrapper(Tool):
def __init__(self, _langchain_tool):
- super().__init__()
self.name = _langchain_tool.name.lower()
self.description = _langchain_tool.description
- self.inputs = parse_langchain_args(_langchain_tool.args)
+ self.inputs = _langchain_tool.args.copy()
+ for input_content in self.inputs.values():
+ if "title" in input_content:
+ input_content.pop("title")
+ input_content["description"] = ""
self.output_type = "string"
self.langchain_tool = _langchain_tool
@@ -805,15 +936,6 @@ def __call__(
return response.json()
-def parse_langchain_args(args: Dict[str, str]) -> Dict[str, str]:
- """Parse the args attribute of a LangChain tool to create a matching inputs dictionary."""
- inputs = args.copy()
- for arg_details in inputs.values():
- if "title" in arg_details:
- arg_details.pop("title")
- return inputs
-
-
class ToolCollection:
"""
Tool collections enable loading all Spaces from a collection in order to be added to the agent's toolbox.
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index 0f696cc3ac6a4d..490280ce813bd6 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -433,19 +433,22 @@ def update(
self._seen_tokens += key_states.shape[-2]
# Update the cache
- if len(self.key_cache) <= layer_idx:
- # There may be skipped layers, fill them with empty lists
- for _ in range(len(self.key_cache), layer_idx):
- self.key_cache.append([])
- self.value_cache.append([])
- self.key_cache.append(key_states)
- self.value_cache.append(value_states)
- elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors
- self.key_cache[layer_idx] = key_states
- self.value_cache[layer_idx] = value_states
- else:
- self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
- self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
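+ # Only update the cache when new key/value states are provided for this layer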
+ if key_states is not None:
+ if len(self.key_cache) <= layer_idx:
+ # There may be skipped layers, fill them with empty lists
+ for _ in range(len(self.key_cache), layer_idx):
+ self.key_cache.append([])
+ self.value_cache.append([])
+ self.key_cache.append(key_states)
+ self.value_cache.append(value_states)
+ elif (
+ len(self.key_cache[layer_idx]) == 0
+ ): # fills previously skipped layers; checking for tensor causes errors
+ self.key_cache[layer_idx] = key_states
+ self.value_cache[layer_idx] = value_states
+ else:
+ self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+ self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
return self.key_cache[layer_idx], self.value_cache[layer_idx]
@@ -525,7 +528,7 @@ def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int
cache = cls()
for idx in range(len(splits[0])):
key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
- value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
+ value_cache = [current.value_cache[idx] for current in splits if current.value_cache[idx] != []]
if key_cache != []:
layer_keys = torch.cat(key_cache, dim=0)
layer_values = torch.cat(value_cache, dim=0)
@@ -781,6 +784,11 @@ def __init__(self, cache_config: CacheConfig) -> None:
super().__init__(cache_config)
if is_optimum_quanto_available():
+ optimum_quanto_version = version.parse(importlib.metadata.version("optimum-quanto"))
+ if optimum_quanto_version <= version.parse("0.2.5"):
+ raise ImportError(
+ f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}."
+ )
from optimum.quanto import MaxOptimizer, qint2, qint4
elif is_quanto_available():
logger.warning_once(
@@ -813,7 +821,8 @@ def _quantize(self, tensor, axis):
if is_optimum_quanto_available():
from optimum.quanto import quantize_weight
- qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size)
+ scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
+ qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
return qtensor
elif is_quanto_available():
logger.warning_once(
@@ -1514,7 +1523,10 @@ def crop(self, maximum_length: int):
self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length)
- def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
+ @deprecate_kwarg("num_hidden_layers", version="4.47.0")
+ def batch_split(
+ self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
+ ) -> "List[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__)
@@ -1527,7 +1539,10 @@ def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDec
return out
@classmethod
- def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
+ @deprecate_kwarg("num_hidden_layers", version="4.47.0")
+ def from_batch_splits(
+ cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None
+ ) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
self_attention_cache = DynamicCache()
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 1d892c49a231fc..e49eab86b4e12f 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -71,6 +71,8 @@ class PretrainedConfig(PushToHubMixin):
outputs of the model during inference.
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
naming of attributes.
+ - **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-module FQNs of a base model to a tensor
+ parallel plan applied to the sub-module when `model.tensor_parallel` is called.
Common attributes (present in all subclasses):
@@ -190,8 +192,11 @@ class PretrainedConfig(PushToHubMixin):
"""
model_type: str = ""
+ base_config_key: str = ""
+ sub_configs: Dict[str, "PretrainedConfig"] = {}
is_composition: bool = False
attribute_map: Dict[str, str] = {}
+ base_model_tp_plan: Optional[Dict[str, Any]] = None
_auto_class: Optional[str] = None
def __setattr__(self, key, value):
@@ -543,11 +548,22 @@ def from_pretrained(
cls._set_token_in_kwargs(kwargs, token)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
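+ # Composite configs nest sub-model configs under keys such as `text_config` or `vision_config`; when this class
+ # declares a `base_config_key`, pull out its nested section before instantiating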
+ if cls.base_config_key and cls.base_config_key in config_dict:
+ config_dict = config_dict[cls.base_config_key]
+
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
+ # Sometimes the config has no `base_config_key` because it is used in several composite models,
+ # e.g. LlamaConfig. In that case we check whether there is a match in `model_type` before raising a warning
+ for k, v in config_dict.items():
+ if isinstance(v, dict) and v.get("model_type") == cls.model_type:
+ config_dict = v
+
+ # raise warning only if we still can't see a match in `model_type`
+ if config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
return cls.from_dict(config_dict, **kwargs)
@@ -835,6 +851,9 @@ def to_diff_dict(self) -> Dict[str, Any]:
if "_attn_implementation_internal" in serializable_config_dict:
del serializable_config_dict["_attn_implementation_internal"]
+ # Do not serialize `base_model_tp_plan` for now
+ if "base_model_tp_plan" in serializable_config_dict:
+ del serializable_config_dict["base_model_tp_plan"]
return serializable_config_dict
@@ -854,6 +873,9 @@ def to_dict(self) -> Dict[str, Any]:
del output["_commit_hash"]
if "_attn_implementation_internal" in output:
del output["_attn_implementation_internal"]
+ # Do not serialize `base_model_tp_plan` for now
+ if "base_model_tp_plan" in output:
+ del output["base_model_tp_plan"]
# Transformers version when serializing the model
output["transformers_version"] = __version__
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index cc80f6a19bfb26..9e75e6fd3c38df 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -443,7 +443,7 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int]
return torch.stack(examples, dim=0)
# If yes, check if we have a `pad_token`.
- if tokenizer._pad_token is None:
+ if tokenizer.pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
@@ -477,7 +477,7 @@ def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = N
return tf.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
- if tokenizer._pad_token is None:
+ if tokenizer.pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
@@ -513,7 +513,7 @@ def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int]
return np.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
- if tokenizer._pad_token is None:
+ if tokenizer.pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
@@ -1090,7 +1090,7 @@ def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
@@ -1131,7 +1131,7 @@ def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
]
masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = inputs == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
@@ -1170,7 +1170,7 @@ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices[padding_mask] = 0
@@ -1251,13 +1251,13 @@ def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
# probability be `1` (masked), however in albert model attention mask `0` means masked, revert the value
attention_mask = (~masked_indices).float()
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
attention_padding_mask = labels.eq(self.tokenizer.pad_token_id)
attention_mask.masked_fill_(attention_padding_mask, value=1.0)
labels[~masked_indices] = -100 # We only compute loss on masked tokens, -100 is default for CE compute
@@ -1367,7 +1367,7 @@ def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
dtype=torch.bool,
)
masked_indices.masked_fill_(special_tokens_mask, value=0.0)
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
masked_indices.masked_fill_(padding_mask, value=0.0)
@@ -1471,7 +1471,7 @@ def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
)
special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
masked_indices = masked_indices & ~special_tokens_mask
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
@@ -1571,7 +1571,7 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
dtype=bool,
)
masked_indices[special_tokens_mask] = 0
- if self.tokenizer._pad_token is not None:
+ if self.tokenizer.pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices[padding_mask] = 0.0
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index a633f54a4af1a8..9543b58ad40d91 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -24,7 +24,7 @@
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
- "huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
+ "huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py
index 4e0e1dd3430209..bf44d4b427cf7b 100644
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -152,7 +152,8 @@ def get_imports(filename: Union[str, os.PathLike]) -> List[str]:
content = f.read()
# filter out try/except block so in custom code we can have try/except imports
- content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL)
+ content = re.sub(r"\s*try\s*:.*?except.*?:", "", content, flags=re.DOTALL)
+
# filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment
content = re.sub(
r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", content, flags=re.MULTILINE
diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
index b487fa3c7fe6ec..e2ed48433b1639 100644
--- a/src/transformers/generation/__init__.py
+++ b/src/transformers/generation/__init__.py
@@ -49,6 +49,7 @@
_import_structure["candidate_generator"] = [
"AssistedCandidateGenerator",
"CandidateGenerator",
+ "EarlyExitCandidateGenerator",
"PromptLookupCandidateGenerator",
]
_import_structure["logits_process"] = [
@@ -206,7 +207,12 @@
else:
from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint
from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
- from .candidate_generator import AssistedCandidateGenerator, CandidateGenerator, PromptLookupCandidateGenerator
+ from .candidate_generator import (
+ AssistedCandidateGenerator,
+ CandidateGenerator,
+ EarlyExitCandidateGenerator,
+ PromptLookupCandidateGenerator,
+ )
from .logits_process import (
AlternatingCodebooksLogitsProcessor,
ClassifierFreeGuidanceLogitsProcessor,
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py
index 1e4d7a4702453a..df213b458cf8bb 100644
--- a/src/transformers/generation/candidate_generator.py
+++ b/src/transformers/generation/candidate_generator.py
@@ -255,7 +255,8 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F
"heuristic",
"heuristic_transient",
}:
- if num_matches == int(self.num_assistant_tokens):
+ # len(scores[0])-1 is the number of candidates according to the target tokenizer.
+ if num_matches == len(scores[0]) - 1:
self.num_assistant_tokens += 2.0
else:
self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
@@ -670,6 +671,62 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F
return
+class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
+ """
+ `CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates
+ candidates through the use of **the model itself**, exiting early. Can only be used with models that support early
+ exit, e.g., `facebook/layerskip-llama3.2-1B`.
+
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+ assistant_model (`PreTrainedModel`):
+ The original model. This model must support early exit (i.e., it is trained to compute logits in earlier
+ layers).
+ generation_config (`~generation.GenerationConfig`, *optional*):
+ The generation configuration to be used as base parametrization for the generation call.
+ logits_processor (`LogitsProcessorList`):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ model_kwargs (`Dict`):
+ The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
+ model as well.
+ inputs_tensor (`torch.Tensor`, *optional*):
+ The model input tensor. In encoder-decoder models, this is the encoder input.
+ """
+
+ def __init__(
+ self,
+ input_ids: torch.LongTensor,
+ assistant_model: "PreTrainedModel",
+ generation_config: "GenerationConfig",
+ model_kwargs: Dict,
+ inputs_tensor: Optional[torch.Tensor] = None,
+ logits_processor: "LogitsProcessorList" = None,
+ ):
+ super().__init__(
+ input_ids=input_ids,
+ assistant_model=assistant_model,
+ generation_config=generation_config,
+ model_kwargs=model_kwargs,
+ inputs_tensor=inputs_tensor,
+ logits_processor=logits_processor,
+ )
+ # We have to move early exit out of the generation config, otherwise the assistant will also call `generate`
+ # with early exit
+ self.assistant_early_exit = self.generation_config.assistant_early_exit
+ self.generation_config.assistant_early_exit = None
+
+ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+ # Temporarily sets the number of hidden layers to the early exit value
+ base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
+ original_num_hidden_layers = base_model.config.num_hidden_layers
+ base_model.config.num_hidden_layers = self.assistant_early_exit
+ candidate_ids, candidate_logits = super().get_candidates(input_ids)
+ base_model.config.num_hidden_layers = original_num_hidden_layers
+ return candidate_ids, candidate_logits
+
+
def _crop_past_key_values(model, past_key_values, max_length):
"""Crops the past key values up to a certain maximum length."""
new_past = []
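# Reviewer usage sketch (not part of the diff): how `EarlyExitCandidateGenerator` is
# expected to be exercised end to end. The checkpoint comes from the class docstring
# above; the prompt and generation kwargs are illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "facebook/layerskip-llama3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Alice and Bob", return_tensors="pt")
# `assistant_early_exit=4` drafts candidate tokens from the first 4 layers of the same
# model and verifies them with the full forward pass (self-speculative decoding).
outputs = model.generate(**inputs, assistant_early_exit=4, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))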
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index c460a19885afc5..de62ee767aeda0 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -172,7 +172,15 @@ class GenerationConfig(PushToHubMixin):
speed up decoding.
cache_implementation (`str`, *optional*, default to `None`):
Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
- {ALL_CACHE_IMPLEMENTATIONS}. We support other cache types, but they must be manually instantiated and
+
+ - `"static"`: [`StaticCache`]
+ - `"offloaded_static"`: [`OffloadedStaticCache`]
+ - `"sliding_window"`: [`SlidingWindowCache`]
+ - `"hybrid"`: [`HybridCache`]
+ - `"mamba"`: [`MambaCache`]
+ - `"quantized"`: [`QuantizedCache`]
+
+ We support other cache types, but they must be manually instantiated and
passed to `generate` through the `past_key_values` argument. See our
[cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
@@ -345,10 +353,13 @@ class GenerationConfig(PushToHubMixin):
than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
(defined by `num_assistant_tokens`) is not yet reached. It is an unsupervised version of the dynamic speculation lookahead
from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models .
- prompt_lookup_num_tokens (`int`, *optional*, default to `None`):
+ prompt_lookup_num_tokens (`int`, *optional*):
The number of tokens to be output as candidate tokens.
- max_matching_ngram_size (`int`, *optional*, default to `None`):
+ max_matching_ngram_size (`int`, *optional*):
The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided.
+ assistant_early_exit (`int`, *optional*):
+ If set to a positive integer, early exit of the model will be used as an assistant model. Can only be used with
+ models that support early exit (i.e., models where logits from intermediate layers can be interpreted by the LM head).
> Wild card
@@ -446,10 +457,9 @@ def __init__(self, **kwargs):
self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
-
- # Prompt lookup decoding
self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
+ self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
# Wild card
self.generation_kwargs = kwargs.pop("generation_kwargs", {})
@@ -526,7 +536,11 @@ def get_generation_mode(self, assistant_model: Optional["PreTrainedModel"] = Non
generation_mode = GenerationMode.BEAM_SEARCH
# Assisted generation may extend some generation modes
- if assistant_model is not None or self.prompt_lookup_num_tokens is not None:
+ if (
+ assistant_model is not None
+ or self.prompt_lookup_num_tokens is not None
+ or self.assistant_early_exit is not None
+ ):
if generation_mode in ("greedy_search", "sample"):
generation_mode = GenerationMode.ASSISTED_GENERATION
else:
@@ -1471,8 +1485,8 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig
- >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it')
- >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it')
+ >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left")
+ >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
>>> # SynthID Text configuration
>>> watermarking_config = SynthIDTextWatermarkingConfig(
@@ -1481,11 +1495,11 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
... )
>>> # Generation with watermarking
- >>> tokenized_prompts = tokenizer(["your prompts here"])
+ >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True)
>>> output_sequences = model.generate(
- ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True,
+ ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10
... )
- >>> watermarked_text = tokenizer.batch_decode(output_sequences)
+ >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
```
"""
diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py
index 08480ac983e805..8e87ead7fdd5a9 100644
--- a/src/transformers/generation/flax_utils.py
+++ b/src/transformers/generation/flax_utils.py
@@ -397,6 +397,12 @@ def generate(
"(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
)
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+ else: # by default let's always generate 20 new tokens
+ if generation_config.max_length == GenerationConfig().max_length:
+ generation_config.max_length = generation_config.max_length + input_ids_seq_length
+ max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+ if max_position_embeddings is not None:
+ generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
raise ValueError(
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index fde95c7a85652f..9d244191da811c 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -2565,8 +2565,8 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig
- >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it')
- >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it')
+ >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left")
+ >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
>>> # SynthID Text configuration
>>> watermarking_config = SynthIDTextWatermarkingConfig(
@@ -2575,11 +2575,11 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
... )
>>> # Generation with watermarking
- >>> tokenized_prompts = tokenizer(["your prompts here"])
+ >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True)
>>> output_sequences = model.generate(
- ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True,
+ ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10
... )
- >>> watermarked_text = tokenizer.batch_decode(output_sequences)
+ >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
```
"""
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index efe953db051cb3..e3657550d0e7de 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -54,6 +54,7 @@
AssistedCandidateGenerator,
AssistedCandidateGeneratorDifferentTokenizers,
CandidateGenerator,
+ EarlyExitCandidateGenerator,
PromptLookupCandidateGenerator,
_crop_past_key_values,
_prepare_attention_mask,
@@ -378,10 +379,14 @@ def prepare_inputs_for_generation(
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
# Exception 1: when passing input_embeds, input_ids may be missing entries
# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
- # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case
+ # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want a dummy token in that case.
+ # (we can't check exception 3 while compiling)
if past_key_values is not None:
model_inputs["past_key_values"] = past_key_values
- if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3
+ if (
+ inputs_embeds is not None # Exception 1
+ or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3
+ ):
input_ids = input_ids[:, -cache_position.shape[0] :]
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
input_ids = input_ids[:, cache_position]
@@ -414,7 +419,7 @@ def prepare_inputs_for_generation(
for model_input_name in ["position_ids", "token_type_ids"]:
model_input = kwargs.get(model_input_name)
if model_input is not None:
- if past_key_values:
+ if past_key_values is not None:
model_input = model_input[:, -input_ids.shape[1] :]
model_input = model_input.clone(memory_format=torch.contiguous_format)
model_inputs[model_input_name] = model_input
@@ -568,27 +573,34 @@ def _maybe_initialize_input_ids_for_generation(
def _prepare_attention_mask_for_generation(
self,
- inputs: torch.Tensor,
- pad_token_id: Optional[torch.Tensor],
- eos_token_id: Optional[torch.Tensor],
+ inputs_tensor: torch.Tensor,
+ generation_config: GenerationConfig,
+ model_kwargs: Dict[str, Any],
) -> torch.LongTensor:
+ pad_token_id = generation_config._pad_token_tensor
+ eos_token_id = generation_config._eos_token_tensor
+
+ # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. multimodal model)
+ if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0:
+ inputs_tensor = model_kwargs["input_ids"]
+
# No information for attention mask inference -> return default attention mask
- default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
+ default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device)
if pad_token_id is None:
return default_attention_mask
- is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
+ is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long]
if not is_input_ids:
return default_attention_mask
is_pad_token_in_inputs = (pad_token_id is not None) and (
- isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any()
+ isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any()
)
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any()
)
can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
- attention_mask_from_padding = inputs.ne(pad_token_id).long()
+ attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long()
attention_mask = (
attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask
@@ -811,7 +823,16 @@ def _get_candidate_generator(
"""
different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer))
- if generation_config.prompt_lookup_num_tokens is not None:
+ if generation_config.assistant_early_exit is not None:
+ candidate_generator = EarlyExitCandidateGenerator(
+ input_ids=input_ids,
+ assistant_model=self,
+ generation_config=generation_config,
+ model_kwargs=model_kwargs,
+ inputs_tensor=inputs_tensor,
+ logits_processor=logits_processor,
+ )
+ elif generation_config.prompt_lookup_num_tokens is not None:
candidate_generator = PromptLookupCandidateGenerator(
eos_token_id=generation_config._eos_token_tensor,
num_output_tokens=generation_config.prompt_lookup_num_tokens,
@@ -1441,10 +1462,11 @@ def _prepare_generated_length(
):
generation_config.max_length -= inputs_tensor.shape[1]
elif has_default_max_length: # by default let's always generate 20 new tokens
- generation_config.max_length = generation_config.max_length + input_ids_length
- max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
- if max_position_embeddings is not None:
- generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
+ if generation_config.max_length == GenerationConfig().max_length:
+ generation_config.max_length = generation_config.max_length + input_ids_length
+ max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+ if max_position_embeddings is not None:
+ generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
# same for min length
if generation_config.min_new_tokens is not None:
@@ -1624,7 +1646,10 @@ def get_layer_device_map(execution_device_map: Optional[dict] = None):
# This is needed here if we don't want to make changes in accelerate in order to save execution_device
# For offloaded case, we need to get the execution device, not just the device where it is offloaded
if hasattr(self, "hf_device_map"):
- main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
+ if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}:
+ main_device = "cpu"
+ else:
+ main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
execution_device_map = {
name: main_device if device in ["cpu", "disk"] else device
for name, device in self.hf_device_map.items()
@@ -2020,7 +2045,7 @@ def generate(
if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask:
model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
- inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor
+ inputs_tensor, generation_config, model_kwargs
)
elif kwargs_has_attention_mask:
# TODO (joao): generalize this check with other types of inputs
@@ -3221,7 +3246,7 @@ def _sample(
# Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
# (the clone itself is always small)
- next_token_logits = outputs.logits.clone()[:, -1, :].float()
+ next_token_logits = outputs.logits[:, -1, :].clone().float()
next_token_logits = next_token_logits.to(input_ids.device)
# pre-process distribution
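# Reviewer sketch (not part of the diff): the core inference rule behind the refactored
# `_prepare_attention_mask_for_generation` -- padding positions are masked out only when a
# pad token id is known and it cannot be confused with the EOS token id. Toy values below.
import torch

input_ids = torch.tensor([[0, 0, 5, 6, 7]])  # 0 is the pad token in this toy batch
pad_token_id, eos_token_id = 0, 2

can_infer = bool((input_ids == pad_token_id).any()) and pad_token_id != eos_token_id
attention_mask = input_ids.ne(pad_token_id).long() if can_infer else torch.ones_like(input_ids)
print(attention_mask)  # tensor([[0, 0, 1, 1, 1]])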
diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
index d1a08132d73d89..3c1be325b7eb30 100644
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@@ -15,14 +15,18 @@
import functools
from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional, Tuple
from .image_processing_utils import BaseImageProcessor
-from .utils.import_utils import is_torchvision_available
+from .utils.import_utils import is_torch_available, is_torchvision_available
if is_torchvision_available():
from torchvision.transforms import Compose
+if is_torch_available():
+ import torch
+
@dataclass(frozen=True)
class SizeDict:
@@ -66,3 +70,64 @@ def to_dict(self):
encoder_dict = super().to_dict()
encoder_dict.pop("_transform_params", None)
return encoder_dict
+
+
+def get_image_size_for_max_height_width(
+ image_size: Tuple[int, int],
+ max_height: int,
+ max_width: int,
+) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the maximum allowed height and width. Keeps the aspect ratio.
+ Note that even if image_height < max_height and image_width < max_width, the image will be resized
+ so that at least one of its edges equals max_height or max_width.
+
+ For example:
+ - image_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+ - image_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+ Args:
+ image_size (`Tuple[int, int]`):
+ The (height, width) of the image to resize.
+ max_height (`int`):
+ The maximum allowed height.
+ max_width (`int`):
+ The maximum allowed width.
+ """
+ height, width = image_size
+ height_scale = max_height / height
+ width_scale = max_width / width
+ min_scale = min(height_scale, width_scale)
+ new_height = int(height * min_scale)
+ new_width = int(width * min_scale)
+ return new_height, new_width
+
+
+def safe_squeeze(tensor: "torch.Tensor", axis: Optional[int] = None) -> "torch.Tensor":
+ """
+ Squeezes a tensor, but only if the axis specified has dim 1.
+ """
+ if axis is None:
+ return tensor.squeeze()
+
+ try:
+ return tensor.squeeze(axis=axis)
+ except ValueError:
+ return tensor
+
+
+def max_across_indices(values: Iterable[Any]) -> List[Any]:
+ """
+ Return the maximum value across all indices of an iterable of values.
+ """
+ return [max(values_i) for values_i in zip(*values)]
+
+
+def get_max_height_width(images: List["torch.Tensor"]) -> Tuple[int]:
+ """
+ Get the maximum height and width across all images in a batch.
+ """
+
+ _, max_height, max_width = max_across_indices([img.shape for img in images])
+
+ return (max_height, max_width)
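# Reviewer usage sketch (not part of the diff) for the helpers above. Importing them from
# `transformers.image_processing_utils_fast` assumes the final module layout of this PR.
import torch
from transformers.image_processing_utils_fast import (
    get_image_size_for_max_height_width,
    get_max_height_width,
)

# Aspect-ratio preserving resize targets.
print(get_image_size_for_max_height_width((100, 200), max_height=50, max_width=50))    # (25, 50)
print(get_image_size_for_max_height_width((100, 200), max_height=200, max_width=500))  # (200, 400)

# Largest height/width across a batch of channel-first image tensors, e.g. for padding.
images = [torch.zeros(3, 100, 200), torch.zeros(3, 300, 150)]
print(get_max_height_width(images))  # (300, 200)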
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 1a70ef05638379..f59b99b490d38d 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -385,6 +385,27 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] =
return image
+def load_images(
+ images: Union[List, Tuple, str, "PIL.Image.Image"], timeout: Optional[float] = None
+) -> Union["PIL.Image.Image", List["PIL.Image.Image"], List[List["PIL.Image.Image"]]]:
+ """Loads images, handling different levels of nesting.
+
+ Args:
+ images: A single image, a list of images, or a list of lists of images to load.
+ timeout: Timeout for loading images.
+
+ Returns:
+ A single image, a list of images, a list of lists of images.
+ """
+ if isinstance(images, (list, tuple)):
+ if len(images) and isinstance(images[0], (list, tuple)):
+ return [[load_image(image, timeout=timeout) for image in image_group] for image_group in images]
+ else:
+ return [load_image(image, timeout=timeout) for image in images]
+ else:
+ return load_image(images, timeout=timeout)
+
+
def validate_preprocess_arguments(
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
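# Reviewer usage sketch (not part of the diff): `load_images` mirrors `load_image` but
# preserves the nesting of its input. The URL is illustrative.
from transformers.image_utils import load_images

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
single = load_images(url)              # PIL.Image.Image
flat = load_images([url, url])         # [PIL.Image.Image, PIL.Image.Image]
nested = load_images([[url], [url]])   # [[PIL.Image.Image], [PIL.Image.Image]]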
diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 4a2740fcb30e1c..57f0af5667e648 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -235,6 +235,33 @@
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
+ "mamba": {
+ "token_embd": "backbone.embeddings",
+ "blk": "backbone.layers",
+ "ssm_a": "mixer.A_log",
+ "ssm_conv1d": "mixer.conv1d",
+ "ssm_in": "mixer.in_proj",
+ "ssm_out": "mixer.out_proj",
+ "ssm_x": "mixer.x_proj",
+ "ssm_dt": "mixer.dt_proj",
+ "attn_norm": "norm",
+ "output_norm": "backbone.norm_f",
+ "output.weight": "lm_head.weight",
+ },
+ "nemotron": {
+ "token_embd": "model.embed_tokens",
+ "blk": "model.layers",
+ "ffn_up": "mlp.up_proj",
+ "ffn_down": "mlp.down_proj",
+ "ffn_norm": "post_attention_layernorm",
+ "attn_norm": "input_layernorm",
+ "attn_q": "self_attn.q_proj",
+ "attn_v": "self_attn.v_proj",
+ "attn_k": "self_attn.k_proj",
+ "attn_output": "self_attn.o_proj",
+ "output.weight": "lm_head.weight",
+ "output_norm": "model.norm",
+ },
}
@@ -373,6 +400,29 @@
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_epsilon": "norm_epsilon",
},
+ "mamba": {
+ "vocab_size": "vocab_size",
+ "context_length": "max_position_embeddings",
+ "embedding_length": "hidden_size",
+ "attention.layer_norm_rms_epsilon": "layer_norm_epsilon",
+ "block_count": "num_hidden_layers",
+ "ssm.conv_kernel": "conv_kernel",
+ "ssm.state_size": "state_size",
+ "ssm.time_step_rank": "time_step_rank",
+ "ssm.inner_size": "intermediate_size",
+ },
+ "nemotron": {
+ "context_length": "max_position_embeddings",
+ "block_count": "num_hidden_layers",
+ "feed_forward_length": "intermediate_size",
+ "embedding_length": "hidden_size",
+ "rope.dimension_count": None,
+ "rope.freq_base": "rope_theta",
+ "attention.head_count": "num_attention_heads",
+ "attention.head_count_kv": "num_key_value_heads",
+ "attention.layer_norm_rms_epsilon": "norm_eps",
+ "vocab_size": "vocab_size",
+ },
}
GGUF_TOKENIZER_MAPPING = {
@@ -768,6 +818,8 @@ def converted(self) -> Tokenizer:
"gpt2": GGUFGPTConverter,
"starcoder2": GGUFGPTConverter,
"t5": GGUFT5Converter,
+ "mamba": GGUFGPTConverter,
+ "nemotron": GGUFGPTConverter,
}
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index a09116552c8e34..0cc2685a55206f 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -208,7 +208,7 @@ def hp_params(trial):
if is_optuna_available():
import optuna
- if isinstance(trial, optuna.Trial):
+ if isinstance(trial, optuna.trial.BaseTrial):
return trial.params
if is_ray_tune_available():
if isinstance(trial, dict):
@@ -230,7 +230,7 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
if trainer.args.process_index == 0:
- def _objective(trial, checkpoint_dir=None):
+ def _objective(trial: optuna.Trial, checkpoint_dir=None):
checkpoint = None
if checkpoint_dir:
for subdir in os.listdir(checkpoint_dir):
@@ -240,10 +240,11 @@ def _objective(trial, checkpoint_dir=None):
if trainer.args.world_size > 1:
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
- trainer._hp_search_setup(trial)
- args_main_rank_list = [pickle.dumps(trainer.args)]
- torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
- trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.hp_space(trial)
+ fixed_trial = optuna.trial.FixedTrial(trial.params, trial.number)
+ trial_main_rank_list = [fixed_trial]
+ torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
+ trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
else:
trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
# If there hasn't been any evaluation during the training loop.
@@ -268,15 +269,11 @@ def _objective(trial, checkpoint_dir=None):
else:
for i in range(n_trials):
trainer.objective = None
- args_main_rank_list = [None]
+ trial_main_rank_list = [None]
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
- torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
- args = pickle.loads(bytes(args_main_rank_list[0]))
- for key, value in asdict(args).items():
- if key != "local_rank":
- setattr(trainer.args, key, value)
- trainer.train(resume_from_checkpoint=None)
+ torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
+ trainer.train(resume_from_checkpoint=None, trial=trial_main_rank_list[0])
# If there hasn't been any evaluation during the training loop.
if getattr(trainer, "objective", None) is None:
metrics = trainer.evaluate()
@@ -697,6 +694,8 @@ def on_log(self, args, state, control, logs=None, **kwargs):
for k, v in logs.items():
if isinstance(v, (int, float)):
self.tb_writer.add_scalar(k, v, state.global_step)
+ elif isinstance(v, str):
+ self.tb_writer.add_text(k, v, state.global_step)
else:
logger.warning(
"Trainer is attempting to log a value of "
@@ -916,7 +915,7 @@ def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwarg
if self._log_model.is_enabled and self._initialized and state.is_world_process_zero:
from ..trainer import Trainer
- fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer)
+ fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer, eval_dataset=["fake"])
with tempfile.TemporaryDirectory() as temp_dir:
fake_trainer.save_model(temp_dir)
metadata = (
@@ -2123,7 +2122,12 @@ def on_train_end(self, args, state, control, **kwargs):
from transformers.trainer import Trainer
if self._log_model is True:
- fake_trainer = Trainer(args=args, model=kwargs.get("model"), processing_class=kwargs.get("tokenizer"))
+ fake_trainer = Trainer(
+ args=args,
+ model=kwargs.get("model"),
+ processing_class=kwargs.get("tokenizer"),
+ eval_dataset=["fake"],
+ )
name = "best" if args.load_best_model_at_end else "last"
output_dir = os.path.join(args.output_dir, name)
fake_trainer.save_model(output_dir)
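# Reviewer sketch (not part of the diff): the synchronization idea behind the Optuna DDP
# change above. Instead of pickling TrainingArguments, rank 0 freezes the sampled trial as
# an `optuna.trial.FixedTrial` and broadcasts it so every rank trains with identical
# hyperparameters. Names below are illustrative, not Trainer internals.
# Assumes torch.distributed has already been initialized (e.g. under torchrun).
import optuna
import torch.distributed as dist


def run_trial(trial_or_none, is_rank_zero: bool):
    if is_rank_zero:
        # `trial_or_none` is a live optuna.Trial whose params were already suggested.
        payload = [optuna.trial.FixedTrial(trial_or_none.params, trial_or_none.number)]
    else:
        payload = [None]
    dist.broadcast_object_list(payload, src=0)
    # Rank 0 keeps its live trial (so Optuna records results); other ranks reuse the
    # broadcast FixedTrial to resolve the same hyperparameter values.
    return trial_or_none if is_rank_zero else payload[0]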
diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 171b2f4d15b122..cca6d548cdf3ac 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -15,7 +15,7 @@
# limitations under the License.
import re
-from typing import Dict, Optional
+from typing import Dict, NamedTuple, Optional
import numpy as np
from tqdm import tqdm
@@ -55,6 +55,200 @@
GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys())
+class GGUFTensor(NamedTuple):
+ weights: np.ndarray
+ name: str
+ metadata: dict
+
+
+class TensorProcessor:
+ def __init__(self, config=None):
+ self.config = config or {}
+
+ def process(self, weights, name, **kwargs):
+ return GGUFTensor(weights, name, {})
+
+
+class LlamaTensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ if ".attn_k." in name or ".attn_q." in name:
+ num_heads = self.config.get("num_attention_heads")
+ num_kv_heads = self.config.get("num_key_value_heads")
+
+ if None in (num_heads, num_kv_heads):
+ return GGUFTensor(weights, name, {})
+ if ".attn_q." in name:
+ weights = self._reverse_permute_weights(weights, num_heads, num_heads)
+ elif ".attn_k." in name:
+ weights = self._reverse_permute_weights(weights, num_heads, num_kv_heads)
+ return GGUFTensor(weights, name, {})
+
+ def _reverse_permute_weights(
+ self, weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None
+ ) -> np.ndarray:
+ # Original permutation implementation
+ # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
+ if num_kv_heads is not None and n_head != num_kv_heads:
+ n_head = num_kv_heads
+
+ dim = weights.shape[0] // n_head // 2
+ w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
+ return w.swapaxes(2, 1).reshape(weights.shape)
+
+
+class Qwen2MoeTensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ if "_exp" in name:
+ tensor_key_mapping = kwargs.get("tensor_key_mapping")
+ parsed_parameters = kwargs.get("parsed_parameters")
+ if tensor_key_mapping:
+ self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
+ return GGUFTensor(weights, None, {})
+ if "ffn_gate_inp_shexp" in name:
+ # for compatibility tensor shared_expert_gate must be (1, 2048) dim,
+ # quantized one is (2048)
+ weights = np.expand_dims(weights, axis=0)
+ return GGUFTensor(weights, name, {})
+
+ def _split_moe_expert_tensor(
+ self, weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
+ ):
+ # Original merge implementation
+ # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
+ exp_name = ""
+ if "ffn_gate_exps" in name:
+ exp_name = "gate_proj"
+ elif "ffn_down_exps" in name:
+ exp_name = "down_proj"
+ elif "ffn_up_exps" in name:
+ exp_name = "up_proj"
+ else:
+ raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
+ for tensor_name in tensor_key_mapping:
+ if tensor_name in name:
+ name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
+ w_counter = self.config.get("num_experts", 60)
+ for i in range(0, w_counter):
+ temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
+ exp_weight = weights[i]
+ parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
+
+
+class BloomTensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ if "attn_qkv" in name:
+ num_heads = self.config["n_head"]
+ n_embed = self.config["hidden_size"]
+ if "weight" in name:
+ weights = self._reverse_reshape_weights(weights, num_heads, n_embed)
+ else:
+ weights = self._reverse_reshape_bias(weights, num_heads, n_embed)
+ return GGUFTensor(weights, name, {})
+
+ def _reverse_reshape_weights(self, weights: np.ndarray, n_head: int, n_embed: int):
+ # Original reshape implementation
+ # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
+ q, k, v = np.array_split(weights, 3, axis=0)
+
+ q = q.reshape(n_head, n_embed // n_head, n_embed)
+ k = k.reshape(n_head, n_embed // n_head, n_embed)
+ v = v.reshape(n_head, n_embed // n_head, n_embed)
+ qkv_weights = np.stack([q, k, v], axis=1)
+
+ return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
+
+ def _reverse_reshape_bias(self, weights: np.ndarray, n_head: int, n_embed: int):
+ # Original reshape implementation
+ # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
+ q_bias, k_bias, v_bias = np.array_split(weights, 3)
+
+ q_bias = q_bias.reshape(n_head, n_embed // n_head)
+ k_bias = k_bias.reshape(n_head, n_embed // n_head)
+ v_bias = v_bias.reshape(n_head, n_embed // n_head)
+
+ qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
+ return qkv_bias
+
+
+class T5TensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ bid = None
+ for chunk in name.split("."):
+ if chunk.isdigit():
+ bid = int(chunk)
+ break
+ return GGUFTensor(weights, name, {"bid": bid})
+
+
+class GPT2TensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ # Original transpose implementation
+ # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
+ if (
+ "attn_qkv.weight" in name
+ or "ffn_down.weight" in name
+ or "ffn_up.weight" in name
+ or "attn_output.weight" in name
+ ):
+ weights = weights.T
+
+ # Handle special case for output.weight
+ if name == "output.weight":
+ # output.weight has conflicts with attn_output.weight in name checking
+ # Store the tensor directly and signal to skip further processing
+ name = "lm_head.weight"
+ parsed_parameters = kwargs.get("parsed_parameters", {})
+ parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
+ name = None # Signal to skip further processing
+ return GGUFTensor(weights, name, {})
+
+
+class MambaTensorProcessor(TensorProcessor):
+ def __init__(self, config=None):
+ super().__init__(config=config)
+
+ def process(self, weights, name, **kwargs):
+ if "ssm_d" in name and "bias" not in name and "weight" not in name:
+ # ssm_d has conflicts with ssm_dt in name checking
+ # we have to explicitly check that name is exactly ssm_d
+ name = name.replace("ssm_d", "mixer.D")
+ if "ssm_conv1d.weight" in name:
+ # for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
+ # quantized one is (5120, 4)
+ weights = np.expand_dims(weights, axis=1)
+ if "ssm_a" in name:
+ # Original exponential implementation
+ # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977
+ weights = np.log(-weights)
+ return GGUFTensor(weights, name, {})
+
+
+TENSOR_PROCESSORS = {
+ "llama": LlamaTensorProcessor,
+ "qwen2moe": Qwen2MoeTensorProcessor,
+ "bloom": BloomTensorProcessor,
+ "t5": T5TensorProcessor,
+ "t5encoder": T5TensorProcessor,
+ "gpt2": GPT2TensorProcessor,
+ "mamba": MambaTensorProcessor,
+}
+
+
def read_field(reader, field):
value = reader.fields[field]
return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
@@ -106,6 +300,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
if "qwen2moe" in architecture:
updated_architecture = "qwen2_moe"
+ # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
+ # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors
+ # If `use_parallel_residual=False`, ffn_norm will be present in the tensors
+ if "stablelm" in architecture:
+ attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"}
+ ffn_norm_name = "ffn_norm"
+ qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name)
+ use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors)
+ parsed_parameters["config"]["qkv_bias"] = qkv_bias
+ parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual
+
model_size = ""
# extract the number of params from file name as architectures can differ ;
# eg. for falcon : `...falcon-7b-...`
@@ -166,60 +371,28 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
if return_tensors:
tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size]
+ config = parsed_parameters.get("config", {})
+
+ ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
+ processor = ProcessorClass(config=config)
for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."):
name = tensor.name
-
weights = dequantize(tensor.data, tensor.tensor_type)
- if architecture == "llama" and (".attn_k." in name or ".attn_q." in name):
- num_heads = parsed_parameters["config"]["num_attention_heads"]
- num_kv_heads = parsed_parameters["config"]["num_key_value_heads"]
- if ".attn_q." in name:
- weights = reverse_permute_weights(weights, num_heads, num_heads)
- elif ".attn_k." in name:
- weights = reverse_permute_weights(weights, num_heads, num_kv_heads)
-
- if architecture == "qwen2moe":
- if "_exp" in name:
- split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
- continue
- if "ffn_gate_inp_shexp" in name:
- # for compatibility tensor shared_expert_gate must be (1, 2048) dim,
- # quantized one is (2048)
- weights = np.expand_dims(weights, axis=0)
-
- if architecture == "bloom" and "attn_qkv" in name:
- num_heads = parsed_parameters["config"]["n_head"]
- n_embed = parsed_parameters["config"]["hidden_size"]
- if "weight" in name:
- weights = reverse_reshape_weights(weights, num_heads, n_embed)
- else:
- weights = reverse_reshape_bias(weights, num_heads, n_embed)
-
- bid = None
- if architecture in ("t5", "t5encoder"):
- for chunk in name.split("."):
- if chunk.isdigit():
- bid = int(chunk)
- break
-
- if architecture == "gpt2":
- if (
- "attn_qkv.weight" in name
- or "ffn_down.weight" in name
- or "ffn_up.weight" in name
- or "attn_output.weight" in name
- ):
- # Original transpose implementation
- # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
- weights = weights.T
- if name == "output.weight":
- # output.weight has conflicts with attn_output.weight in name checking
- # we have to explicitly check that name is exactly output.weight
- name = "lm_head.weight"
- parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
- continue
+ result = processor.process(
+ weights=weights,
+ name=name,
+ tensor_key_mapping=tensor_key_mapping,
+ parsed_parameters=parsed_parameters,
+ )
+
+ weights = result.weights
+ name = result.name
+ bid = result.metadata.get("bid")
+
+ if name is None:
+ continue
for tensor_name in tensor_key_mapping:
if tensor_name.format(bid=bid) in name:
@@ -232,64 +405,3 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")
return parsed_parameters
-
-
-def reverse_permute_weights(weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None) -> np.ndarray:
- # Original permutation implementation
- # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
- if num_kv_heads is not None and n_head != num_kv_heads:
- n_head = num_kv_heads
-
- dim = weights.shape[0] // n_head // 2
- w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
- return w.swapaxes(2, 1).reshape(weights.shape)
-
-
-def reverse_reshape_weights(weights: np.ndarray, n_head: int, n_embed: int):
- # Original reshape implementation
- # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
- q, k, v = np.array_split(weights, 3, axis=0)
-
- q = q.reshape(n_head, n_embed // n_head, n_embed)
- k = k.reshape(n_head, n_embed // n_head, n_embed)
- v = v.reshape(n_head, n_embed // n_head, n_embed)
- qkv_weights = np.stack([q, k, v], axis=1)
-
- return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
-
-
-def reverse_reshape_bias(weights: np.ndarray, n_head: int, n_embed: int):
- # Original reshape implementation
- # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
- q_bias, k_bias, v_bias = np.array_split(weights, 3)
-
- q_bias = q_bias.reshape(n_head, n_embed // n_head)
- k_bias = k_bias.reshape(n_head, n_embed // n_head)
- v_bias = v_bias.reshape(n_head, n_embed // n_head)
-
- qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
- return qkv_bias
-
-
-def split_moe_expert_tensor(
- weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
-):
- # Original merge implementation
- # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
- exp_name = ""
- if "ffn_gate_exps" in name:
- exp_name = "gate_proj"
- elif "ffn_down_exps" in name:
- exp_name = "down_proj"
- elif "ffn_up_exps" in name:
- exp_name = "up_proj"
- else:
- raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
- for tensor_name in tensor_key_mapping:
- if tensor_name in name:
- name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
- w_counter = parsed_parameters["config"].get("num_experts", 60)
- for i in range(0, w_counter):
- temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
- exp_weight = weights[i]
- parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index a6fbd7b1a91453..a4de8abed03df4 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -28,7 +28,7 @@
import warnings
from contextlib import contextmanager
from dataclasses import dataclass
-from functools import lru_cache, partial, wraps
+from functools import partial, wraps
from threading import Thread
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
from zipfile import is_zipfile
@@ -52,9 +52,11 @@
find_pruneable_heads_and_indices,
id_tensor_storage,
is_torch_greater_or_equal_than_1_13,
+ is_torch_greater_or_equal_than_2_4,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
+ translate_to_torch_parallel_style,
)
from .quantizers import AutoHfQuantizer, HfQuantizer
from .quantizers.quantizers_utils import get_module_from_name
@@ -94,7 +96,7 @@
replace_return_docstrings,
strtobool,
)
-from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
+from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
ENV_VARS_TRUE_VALUES,
is_sagemaker_mp_enabled,
@@ -136,6 +138,8 @@
_init_weights = True
+_is_quantized = False
+_is_ds_init_called = False
def is_fsdp_enabled():
@@ -213,6 +217,29 @@ def _skip_init(*args, **kwargs):
setattr(torch.nn.init, name, init_func)
+@contextmanager
+def set_quantized_state():
+ global _is_quantized
+ _is_quantized = True
+ try:
+ yield
+ finally:
+ _is_quantized = False
+
+
+# Skip recursive calls to deepspeed.zero.Init to avoid pinning errors.
+# This issue occurs with ZeRO stage 3 when using NVMe offloading.
+# For more details, refer to issue #34429.
+@contextmanager
+def set_zero3_state():
+ global _is_ds_init_called
+ _is_ds_init_called = True
+ try:
+ yield
+ finally:
+ _is_ds_init_called = False
+
+
def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
@@ -348,6 +375,9 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
Note: We fully disable this if we are using `deepspeed`
"""
+ if model_to_load.device.type == "meta":
+ return False
+
if len([key for key in state_dict if key.startswith(start_prefix)]) == 0:
return False
@@ -362,7 +392,7 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
# If the model does, the incoming `state_dict` and the `model_to_load` must be the same dtype
- first_key = list(model_to_load.state_dict().keys())[0]
+ first_key = next(iter(model_to_load.state_dict().keys()))
if start_prefix + first_key in state_dict:
return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype
@@ -370,92 +400,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
-def shard_checkpoint(
- state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
-):
- """
- Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
- given size.
-
- The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
- optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
- limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
- [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
-
-
-
- If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
- have a size greater than `max_shard_size`.
-
-
-
- Args:
- state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
- max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
- The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
- (like `"5MB"`).
- weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
- The name of the model save file.
- """
- logger.warning(
- "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
- "split_torch_state_dict_into_shards from huggingface_hub library"
- )
- max_shard_size = convert_file_size_to_int(max_shard_size)
-
- sharded_state_dicts = [{}]
- last_block_size = 0
- total_size = 0
- storage_id_to_block = {}
-
- for key, weight in state_dict.items():
- # when bnb serialization is used the weights in the state dict can be strings
- # check: https://github.com/huggingface/transformers/pull/24416 for more details
- if isinstance(weight, str):
- continue
- else:
- storage_id = id_tensor_storage(weight)
-
- # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
- if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
- block_id = storage_id_to_block[storage_id]
- sharded_state_dicts[block_id][key] = weight
- continue
-
- weight_size = weight.numel() * dtype_byte_size(weight.dtype)
- # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
- # weight in the current shard.
- if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
- sharded_state_dicts.append({})
- last_block_size = 0
-
- sharded_state_dicts[-1][key] = weight
- last_block_size += weight_size
- total_size += weight_size
- storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
-
- # If we only have one shard, we return it
- if len(sharded_state_dicts) == 1:
- return {weights_name: sharded_state_dicts[0]}, None
-
- # Otherwise, let's build the index
- weight_map = {}
- shards = {}
- for idx, shard in enumerate(sharded_state_dicts):
- shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
- shard_file = shard_file.replace(
- ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
- )
- shards[shard_file] = shard
- for key in shard.keys():
- weight_map[key] = shard_file
-
- # Add the metadata
- metadata = {"total_size": total_size}
- index = {"metadata": metadata, "weight_map": weight_map}
- return shards, index
-
-
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
"""
This is the same as
@@ -943,13 +887,14 @@ def _load_state_dict_into_meta_model(
old_param = model
splits = param_name.split(".")
for split in splits:
- old_param = getattr(old_param, split)
- # Not all the attributes of a module are Parameters/Tensor
- if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)):
- old_param = None
+ # We shouldn't hit the default value except for quant methods like hqq that modify `expected_keys`.
+ old_param = getattr(old_param, split, None)
if old_param is None:
break
+ if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)):
+ old_param = None
+
if old_param is not None:
if dtype is None:
param = param.to(old_param.dtype)
@@ -1001,7 +946,10 @@ def _load_state_dict_into_meta_model(
param_to = "cpu"
if is_fsdp_enabled() and not is_local_dist_rank_0():
param_to = "meta"
- value = type(value)(value.data.to(param_to), **value.__dict__)
+ val_kwargs = {}
+ if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params":
+ val_kwargs["requires_grad"] = False
+ value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__)
setattr(module, tensor_name, value)
# TODO: consider removing used param_parts from state_dict before return
@@ -1397,6 +1345,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Has support for a `QuantoQuantizedCache` instance as `past_key_values`
_supports_quantized_cache = False
+ # A tensor parallel plan to be applied to the model when TP is enabled. For
+ # top-level models, this attribute is currently defined in respective model
+ # code. For base models, this attribute comes from
+ # `config.base_model_tp_plan` during `post_init`.
+ _tp_plan = None
+
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
@@ -1441,6 +1395,9 @@ def post_init(self):
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
+ # If current model is a base model, attach `base_model_tp_plan` from config
+ if self.base_model is self:
+ self._tp_plan = self.config.base_model_tp_plan
def dequantize(self):
"""
@@ -1530,13 +1487,14 @@ def _from_config(cls, config, **kwargs):
torch_dtype=torch_dtype,
)
- if is_deepspeed_zero3_enabled():
+ if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
# this immediately partitions the model across all gpus, to avoid the overhead in time
# and memory copying it on CPU or each GPU first
- with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()):
+ init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()]
+ with ContextManagers(init_contexts):
model = cls(config, **kwargs)
else:
@@ -1579,6 +1537,7 @@ def _autoset_attn_implementation(
"eager",
"sdpa",
"flash_attention_2",
+ "flex_attention",
]:
message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
if cls._supports_flash_attn_2:
@@ -1596,15 +1555,14 @@ def _autoset_attn_implementation(
# Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict.
# Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)`
# If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238
- for key in config:
- if isinstance(getattr(config, key), PretrainedConfig):
- sub_config = getattr(config, key)
- curr_attn_implementation = (
- requested_attn_implementation
- if not isinstance(requested_attn_implementation, dict)
- else requested_attn_implementation.get(key, None)
- )
- sub_config._attn_implementation_internal = curr_attn_implementation
+ for key in config.sub_configs.keys():
+ sub_config = getattr(config, key)
+ curr_attn_implementation = (
+ requested_attn_implementation
+ if not isinstance(requested_attn_implementation, dict)
+ else requested_attn_implementation.get(key, None)
+ )
+ sub_config._attn_implementation_internal = curr_attn_implementation
if use_flash_attention_2:
logger.warning_once(
@@ -3471,6 +3429,11 @@ def from_pretrained(
# Cache path to the GGUF file
gguf_path = None
+ tp_plan = kwargs.pop("tp_plan", None)
+ if tp_plan is not None and tp_plan != "auto":
+ # TODO: we can relax this check when we support taking tp_plan from a json file, for example.
+ raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.")
+
if is_fsdp_enabled():
low_cpu_mem_usage = True
@@ -3654,7 +3617,11 @@ def from_pretrained(
if hf_quantizer is not None:
hf_quantizer.validate_environment(
- torch_dtype=torch_dtype, from_tf=from_tf, from_flax=from_flax, device_map=device_map
+ torch_dtype=torch_dtype,
+ from_tf=from_tf,
+ from_flax=from_flax,
+ device_map=device_map,
+ weights_only=weights_only,
)
torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)
device_map = hf_quantizer.update_device_map(device_map)
@@ -4072,18 +4039,35 @@ def from_pretrained(
# Instantiate model.
init_contexts = [no_init_weights(_enable=_fast_init)]
+ tp_device = None
- if is_deepspeed_zero3_enabled() and not is_quantized:
+ if is_deepspeed_zero3_enabled() and not is_quantized and not _is_ds_init_called:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
- init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts
+ init_contexts = [
+ deepspeed.zero.Init(config_dict_or_path=deepspeed_config()),
+ set_zero3_state(),
+ ] + init_contexts
elif low_cpu_mem_usage:
if not is_accelerate_available():
raise ImportError(
f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
)
init_contexts.append(init_empty_weights())
+ elif tp_plan is not None:
+ if not torch.distributed.is_initialized():
+ raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")
+
+ # Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
+ device_type = torch._C._get_accelerator().type
+ device_module = torch.get_device_module(device_type)
+ # Get device with index assuming equal number of devices per host
+ tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
+ init_contexts.append(tp_device)
+
+ if is_deepspeed_zero3_enabled() and is_quantized:
+ init_contexts.append(set_quantized_state())
config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained.
if not getattr(config, "_attn_implementation_autoset", False):
@@ -4214,32 +4198,38 @@ def from_pretrained(
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
- (
- model,
- missing_keys,
- unexpected_keys,
- mismatched_keys,
- offload_index,
- error_msgs,
- ) = cls._load_pretrained_model(
- model,
- state_dict,
- loaded_state_dict_keys, # XXX: rename?
- resolved_archive_file,
- pretrained_model_name_or_path,
- ignore_mismatched_sizes=ignore_mismatched_sizes,
- sharded_metadata=sharded_metadata,
- _fast_init=_fast_init,
- low_cpu_mem_usage=low_cpu_mem_usage,
- device_map=device_map,
- offload_folder=offload_folder,
- offload_state_dict=offload_state_dict,
- dtype=torch_dtype,
- hf_quantizer=hf_quantizer,
- keep_in_fp32_modules=keep_in_fp32_modules,
- gguf_path=gguf_path,
- weights_only=weights_only,
- )
+ load_contexts = []
+ # Make sure we load onto the targeted device
+ if tp_device is not None:
+ load_contexts.append(tp_device)
+
+ with ContextManagers(load_contexts):
+ (
+ model,
+ missing_keys,
+ unexpected_keys,
+ mismatched_keys,
+ offload_index,
+ error_msgs,
+ ) = cls._load_pretrained_model(
+ model,
+ state_dict,
+ loaded_state_dict_keys, # XXX: rename?
+ resolved_archive_file,
+ pretrained_model_name_or_path,
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
+ sharded_metadata=sharded_metadata,
+ _fast_init=_fast_init,
+ low_cpu_mem_usage=low_cpu_mem_usage,
+ device_map=device_map,
+ offload_folder=offload_folder,
+ offload_state_dict=offload_state_dict,
+ dtype=torch_dtype,
+ hf_quantizer=hf_quantizer,
+ keep_in_fp32_modules=keep_in_fp32_modules,
+ gguf_path=gguf_path,
+ weights_only=weights_only,
+ )
# make sure token embedding weights are still tied if needed
model.tie_weights()
@@ -4323,6 +4313,16 @@ def from_pretrained(
}
return model, loading_info
+ if tp_plan is not None:
+ assert tp_device is not None, "tp_device not set!"
+ if not model.supports_tp_plan:
+ raise NotImplementedError("This model does not have a tensor parallel plan.")
+ # Assume the model is sharded across the entire world size
+ world_size = torch.distributed.get_world_size()
+ device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))
+ # Apply Tensor Parallelism
+ model.tensor_parallel(device_mesh)
+
return model
@classmethod
@@ -5013,7 +5013,56 @@ def _is_quantized_training_enabled(self):
return self.hf_quantizer.is_trainable
@property
- @lru_cache
+ def supports_tp_plan(self):
+ """
+ Returns whether the model has a tensor parallelism plan.
+ """
+ if self._tp_plan is not None:
+ return True
+ # Check if base model has a TP plan
+ if getattr(self.base_model, "_tp_plan", None) is not None:
+ return True
+ return False
+
+ def tensor_parallel(self, device_mesh):
+ """
+ Tensor parallelize the model across the given device mesh.
+
+ Args:
+ device_mesh (`torch.distributed.DeviceMesh`):
+ The device mesh to use for tensor parallelism.
+ """
+ if not is_torch_greater_or_equal_than_2_4:
+ raise EnvironmentError("tensor parallel is only supported for `torch>=2.4`.")
+
+ # Tensor-parallelize an nn.Module based on the `_tp_plan` attribute of the module.
+ # No-op if the `_tp_plan` attribute does not exist on the module.
+ # This is a helper function to be used with `model.apply` to recursively
+ # parallelize a model.
+ def tplize(mod: torch.nn.Module) -> None:
+ tp_plan = getattr(mod, "_tp_plan", None)
+ if tp_plan is None:
+ return
+ logger.debug(f"Applying tensor parallel to {mod.__class__.__name__}: {tp_plan}")
+ # In model configs, we use a neutral type (string) to specify
+ # parallel styles, here we translate them into torch TP types.
+ # Using tree_map because `tp_plan` is a dict.
+ tp_plan = torch.utils._pytree.tree_map(
+ translate_to_torch_parallel_style,
+ tp_plan,
+ )
+ # Apply TP to current module.
+ torch.distributed.tensor.parallel.parallelize_module(
+ mod,
+ device_mesh=device_mesh,
+ parallelize_plan=tp_plan,
+ )
+
+ # `apply` is a native method of `nn.Module` that recursively applies a
+ # function to every submodule.
+ self.apply(tplize)
+
+ @property
def loss_function(self):
if getattr(self.config, "loss_type", None) is not None:
loss_type = self.config.loss_type
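
The modeling_utils.py changes above thread the new `tp_plan` argument through `from_pretrained`: weights are loaded directly onto each rank's accelerator, a 1-D device mesh spanning the world size is created, and `model.tensor_parallel(device_mesh)` applies each module's `_tp_plan` via `parallelize_module`. A minimal launcher-side sketch follows; it assumes `tp_plan="auto"` is the accepted trigger value and that the chosen checkpoint actually defines a `_tp_plan`, neither of which is confirmed by this hunk alone.

```python
# tp_demo.py -- hedged sketch, not the PR's official example.
# Launch with: torchrun --nproc-per-node=4 tp_demo.py
import os

import torch
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer

# from_pretrained(tp_plan=...) raises if torch.distributed is not initialized,
# so set up the process group first; torchrun provides RANK/WORLD_SIZE.
rank = int(os.environ["RANK"])
device = torch.device("cuda", rank % torch.cuda.device_count())
dist.init_process_group("nccl", device_id=device)

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed TP-capable checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id, tp_plan="auto")  # "auto" is an assumption
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits
print(f"rank {rank}: forward pass ran, logits type {type(logits).__name__}")
```
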
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 9155f629e63f91..0d4b9f2f94de9b 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -177,6 +177,7 @@
nougat,
nystromformer,
olmo,
+ olmo_1124,
olmoe,
omdet_turbo,
oneformer,
diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py
index 99fa81b4a9350d..a22ab1dc40f8d0 100644
--- a/src/transformers/models/align/configuration_align.py
+++ b/src/transformers/models/align/configuration_align.py
@@ -14,8 +14,7 @@
# limitations under the License.
"""ALIGN model configuration"""
-import os
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, List
if TYPE_CHECKING:
@@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig):
```"""
model_type = "align_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -133,24 +133,6 @@ def __init__(
self.use_cache = use_cache
self.pad_token_id = pad_token_id
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from AlignConfig
- if config_dict.get("model_type") == "align":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class AlignVisionConfig(PretrainedConfig):
r"""
@@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig):
```"""
model_type = "align_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -272,24 +255,6 @@ def __init__(
self.drop_connect_rate = drop_connect_rate
self.num_hidden_layers = sum(num_block_repeats) * 4
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from AlignConfig
- if config_dict.get("model_type") == "align":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class AlignConfig(PretrainedConfig):
r"""
@@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig):
```"""
model_type = "align"
+ sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}
def __init__(
self,
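
Across the configuration files below, the per-subconfig `from_pretrained` overrides are deleted in favour of two declarative class attributes: `base_config_key` on each sub-config (which nested dict to pull out of a composite checkpoint) and `sub_configs` on the composite config (which classes own those dicts). A hedged sketch of the resulting behaviour, assuming the shared `PretrainedConfig.from_pretrained` now performs the extraction keyed by `base_config_key`:

```python
# Hedged sketch: relies on the generic config loader honouring `base_config_key`,
# which is implied but not shown in this diff.
from transformers import AlignConfig, AlignTextConfig

# Load only the text sub-config directly from a full ALIGN repo.
text_config = AlignTextConfig.from_pretrained("kakaobrain/align-base")
print(type(text_config).__name__, text_config.hidden_size)

# The composite config now declares its children via `sub_configs`.
print(sorted(AlignConfig.sub_configs))         # ['text_config', 'vision_config']
full_config = AlignConfig.from_pretrained("kakaobrain/align-base")
print(type(full_config.text_config).__name__)  # AlignTextConfig
```
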
diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py
index 7333fa63a35280..3c8e91bd473533 100755
--- a/src/transformers/models/altclip/configuration_altclip.py
+++ b/src/transformers/models/altclip/configuration_altclip.py
@@ -14,9 +14,6 @@
# limitations under the License.
"""AltCLIP model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
```"""
model_type = "altclip_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -233,24 +231,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from AltCLIPConfig
- if config_dict.get("model_type") == "altclip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class AltCLIPConfig(PretrainedConfig):
r"""
@@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig):
```"""
model_type = "altclip"
+ sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig}
def __init__(
self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 48625ea3f346cd..7f0182b50085c5 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -195,6 +195,7 @@
("nougat", "VisionEncoderDecoderConfig"),
("nystromformer", "NystromformerConfig"),
("olmo", "OlmoConfig"),
+ ("olmo_1124", "Olmo1124Config"),
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
@@ -510,6 +511,7 @@
("nougat", "Nougat"),
("nystromformer", "Nyströmformer"),
("olmo", "OLMo"),
+ ("olmo_1124", "OLMo November 2024"),
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index d181afeb2d4d0d..0b180272bdb085 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -68,7 +68,7 @@
("convnextv2", ("ConvNextImageProcessor",)),
("cvt", ("ConvNextImageProcessor",)),
("data2vec-vision", ("BeitImageProcessor",)),
- ("deformable_detr", ("DeformableDetrImageProcessor",)),
+ ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)),
@@ -114,6 +114,7 @@
("oneformer", ("OneFormerImageProcessor",)),
("owlv2", ("Owlv2ImageProcessor",)),
("owlvit", ("OwlViTImageProcessor",)),
+ ("paligemma", ("SiglipImageProcessor",)),
("perceiver", ("PerceiverImageProcessor",)),
("pix2struct", ("Pix2StructImageProcessor",)),
("pixtral", ("PixtralImageProcessor",)),
@@ -123,7 +124,7 @@
("qwen2_vl", ("Qwen2VLImageProcessor",)),
("regnet", ("ConvNextImageProcessor",)),
("resnet", ("ConvNextImageProcessor",)),
- ("rt_detr", "RTDetrImageProcessor"),
+ ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")),
("sam", ("SamImageProcessor",)),
("segformer", ("SegformerImageProcessor",)),
("seggpt", ("SegGptImageProcessor",)),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 67c539fca66496..5206972b72efde 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -184,6 +184,7 @@
("nllb-moe", "NllbMoeModel"),
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
+ ("olmo_1124", "Olmo1124Model"),
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
@@ -516,6 +517,7 @@
("mvp", "MvpForCausalLM"),
("nemotron", "NemotronForCausalLM"),
("olmo", "OlmoForCausalLM"),
+ ("olmo_1124", "Olmo1124ForCausalLM"),
("olmoe", "OlmoeForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 7674ea51a53377..4ed67df0e84b52 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -348,6 +348,7 @@
),
),
("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
+ ("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
(
"omdet-turbo",
diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py
index 6dd08b65e89e6c..a498d1dd19371d 100644
--- a/src/transformers/models/bark/configuration_bark.py
+++ b/src/transformers/models/bark/configuration_bark.py
@@ -14,12 +14,11 @@
# limitations under the License.
"""BARK model configuration"""
-import os
-from typing import Dict, Optional, Union
+from typing import Dict
from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings, logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -64,7 +63,6 @@
class BarkSubModelConfig(PretrainedConfig):
- model_type = "bark_module"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
@@ -101,38 +99,6 @@ def __init__(
super().__init__(**kwargs)
- @classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path: Union[str, os.PathLike],
- cache_dir: Optional[Union[str, os.PathLike]] = None,
- force_download: bool = False,
- local_files_only: bool = False,
- token: Optional[Union[str, bool]] = None,
- revision: str = "main",
- **kwargs,
- ) -> "PretrainedConfig":
- kwargs["cache_dir"] = cache_dir
- kwargs["force_download"] = force_download
- kwargs["local_files_only"] = local_files_only
- kwargs["revision"] = revision
-
- cls._set_token_in_kwargs(kwargs, token)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the config dict if we are loading from Bark
- if config_dict.get("model_type") == "bark":
- config_dict = config_dict[f"{cls.model_type}_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
@add_start_docstrings(
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"),
@@ -154,6 +120,7 @@ def from_pretrained(
)
class BarkSemanticConfig(BarkSubModelConfig):
model_type = "semantic"
+ base_config_key = "semantic_config"
@add_start_docstrings(
@@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig):
)
class BarkCoarseConfig(BarkSubModelConfig):
model_type = "coarse_acoustics"
+ base_config_key = "coarse_acoustics_config"
@add_start_docstrings(
@@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig):
)
class BarkFineConfig(BarkSubModelConfig):
model_type = "fine_acoustics"
+ base_config_key = "fine_acoustics_config"
def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
self.n_codes_total = n_codes_total
@@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig):
"""
model_type = "bark"
+ sub_configs = {
+ "semantic_config": BarkSemanticConfig,
+ "coarse_acoustics_config": BarkCoarseConfig,
+ "fine_acoustics_config": BarkFineConfig,
+ "codec_config": AutoConfig,
+ }
def __init__(
self,
diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py
index 4772738be10352..18db71eb14890b 100644
--- a/src/transformers/models/blip/configuration_blip.py
+++ b/src/transformers/models/blip/configuration_blip.py
@@ -14,9 +14,6 @@
# limitations under the License.
"""Blip model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig):
```"""
model_type = "blip_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -146,24 +144,6 @@ def __init__(
self.use_cache = use_cache
self.label_smoothing = label_smoothing
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from BlipConfig
- if config_dict.get("model_type") == "blip":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class BlipVisionConfig(PretrainedConfig):
r"""
@@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig):
```"""
model_type = "blip_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -245,24 +226,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from BlipConfig
- if config_dict.get("model_type") == "blip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class BlipConfig(PretrainedConfig):
r"""
@@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig):
```"""
model_type = "blip"
+ sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
index e7df0578588653..b623d2a8adb17b 100644
--- a/src/transformers/models/blip/modeling_blip.py
+++ b/src/transformers/models/blip/modeling_blip.py
@@ -795,6 +795,12 @@ def __init__(self, config: BlipConfig):
# Initialize weights and apply final processing
self.post_init()
+ def get_input_embeddings(self):
+ return self.text_model.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_model.set_input_embeddings(value)
+
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
@@ -1053,8 +1059,11 @@ def __init__(self, config: BlipConfig):
# Initialize weights and apply final processing
self.post_init()
- def get_input_embeddings(self) -> nn.Module:
- return self.vision_model.embeddings.patch_embedding
+ def get_input_embeddings(self):
+ return self.text_decoder.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_decoder.set_input_embeddings(value)
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig)
@@ -1117,7 +1126,8 @@ def forward(
)
if not return_dict:
- outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
+ outputs = (outputs[0], outputs[1]) if labels is not None else (outputs[0],)
+ outputs += (image_embeds, vision_outputs[0]) + vision_outputs[2:]
return tuple(output for output in outputs if output is not None)
return BlipForConditionalGenerationModelOutput(
@@ -1232,8 +1242,12 @@ def __init__(self, config: BlipConfig):
# Initialize weights and apply final processing
self.post_init()
- def get_input_embeddings(self) -> nn.Module:
- return self.vision_model.embeddings.patch_embedding
+ def set_input_embeddings(self, value):
+ self.text_encoder.set_input_embeddings(value)
+
+ def get_input_embeddings(self):
+ # This returns the shared embeddings if they are shared, otherwise the ones specific to the encoder.
+ return self.text_encoder.get_input_embeddings()
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
@@ -1474,8 +1488,11 @@ def __init__(self, config: BlipConfig):
# Initialize weights and apply final processing
self.post_init()
- def get_input_embeddings(self) -> nn.Module:
- return self.vision_model.embeddings.patch_embedding
+ def get_input_embeddings(self):
+ return self.text_encoder.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_encoder.set_input_embeddings(value)
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
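
With `get_input_embeddings`/`set_input_embeddings` now routed to the text side instead of the vision patch embedding, vocabulary-dependent utilities should behave sensibly on the BLIP heads. A hedged sketch (resizing after adding a token is assumed to work once the embeddings are exposed this way):

```python
# Hedged sketch: `<extra_token>` is a made-up token added purely to exercise
# resize_token_embeddings via the newly exposed text embeddings.
from transformers import AutoProcessor, BlipForConditionalGeneration

model_id = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

emb = model.get_input_embeddings()  # text decoder token embeddings, not the patch embedding
print(type(emb).__name__, tuple(emb.weight.shape))

processor.tokenizer.add_tokens(["<extra_token>"])
model.resize_token_embeddings(len(processor.tokenizer))
print(tuple(model.get_input_embeddings().weight.shape))
```
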
diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index 5ee7ae21f9d549..97a4f523380bc5 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -817,6 +817,12 @@ def __init__(self, config):
self.cls = BlipTextOnlyMLMHead(config)
self.label_smoothing = config.label_smoothing
+ def get_input_embeddings(self):
+ return self.bert.get_input_embeddings()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.bert.set_input_embeddings(new_embeddings)
+
def get_output_embeddings(self):
return self.cls.predictions.decoder
diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py
index 16fa4aec38492b..d690d22338a687 100644
--- a/src/transformers/models/blip_2/configuration_blip_2.py
+++ b/src/transformers/models/blip_2/configuration_blip_2.py
@@ -14,13 +14,12 @@
# limitations under the License.
"""BLIP-2 model configuration"""
-import os
-from typing import Optional, Union
+from typing import Optional
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig):
```"""
model_type = "blip_2_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -106,24 +106,6 @@ def __init__(
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from Blip2Config
- if config_dict.get("model_type") == "blip-2":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class Blip2QFormerConfig(PretrainedConfig):
r"""
@@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig):
```"""
model_type = "blip_2_qformer"
+ base_config_key = "qformer_config"
def __init__(
self,
@@ -229,24 +212,6 @@ def __init__(
self.encoder_hidden_size = encoder_hidden_size
self.use_qformer_text_input = use_qformer_text_input
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the qformer config dict if we are loading from Blip2Config
- if config_dict.get("model_type") == "blip-2":
- config_dict = config_dict["qformer_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class Blip2Config(PretrainedConfig):
r"""
@@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig):
```"""
model_type = "blip-2"
+ sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
index 5f972353c4f41e..d6640045b80c30 100644
--- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
+++ b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
@@ -249,7 +249,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itm"
)
logits = hf_model(
- pixel_values=original_pixel_values,
+ pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=True,
@@ -274,7 +274,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itc"
)
logits = hf_model(
- pixel_values=original_pixel_values,
+ pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=False,
diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index eba82cd1b3c8e4..d34528b7431453 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -1768,11 +1768,12 @@ def forward(
decoder_attention_mask=decoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict,
+ return_dict=True, # toggle for easier access to loss/logits below
labels=labels,
)
- loss = outputs.loss if return_dict else outputs[0]
- logits = outputs.logits if return_dict else outputs[1]
+ loss = outputs.loss
+ logits = outputs.logits
+ outputs = outputs.to_tuple() if not return_dict else outputs
if not return_dict:
output = (logits, vision_outputs, query_outputs, outputs)
@@ -1810,6 +1811,12 @@ def __init__(self, config: Blip2Config):
# Initialize weights and apply final processing
self.post_init()
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
@add_start_docstrings_to_model_forward(BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Blip2TextModelOutput, config_class=Blip2Config)
def forward(
@@ -2196,7 +2203,7 @@ def forward(
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@@ -2233,11 +2240,12 @@ def forward(
decoder_attention_mask=decoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict,
+ return_dict=True, # toggle for easier access to loss/logits below
labels=labels,
)
- loss = outputs.loss if return_dict else outputs[0]
- logits = outputs.logits if return_dict else outputs[1]
+ loss = outputs.loss
+ logits = outputs.logits
+ outputs = outputs.to_tuple() if not return_dict else outputs
if not return_dict:
output = (logits, vision_outputs, query_outputs, outputs)
@@ -2318,7 +2326,7 @@ def generate(
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@@ -2334,24 +2342,11 @@ def generate(
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
- outputs = self.language_model.generate(
- inputs_embeds=inputs_embeds,
- attention_mask=attention_mask,
- **generate_kwargs,
- )
-
- # this is a temporary workaround to be consistent with other generation models and
- # have BOS as the first token, even though under the hood we are calling LM with embeds
+ inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
if not self.language_model.config.is_encoder_decoder:
- bos_tokens = (
- torch.LongTensor([[self.config.text_config.bos_token_id]])
- .repeat(batch_size, 1)
- .to(image_embeds.device)
- )
- if not isinstance(outputs, torch.Tensor):
- outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
- else:
- outputs = torch.cat([bos_tokens, outputs], dim=-1)
+ inputs["input_ids"] = input_ids
+
+ outputs = self.language_model.generate(**inputs, **generate_kwargs)
return outputs
@@ -2389,6 +2384,12 @@ def __init__(self, config: Blip2Config):
# Initialize weights and apply final processing
self.post_init()
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
@add_start_docstrings_to_model_forward(BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Blip2ImageTextMatchingModelOutput, config_class=Blip2Config)
def forward(
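
On the caller side, BLIP-2 generation is unchanged by the refactor above; the BOS token now simply comes from the language model itself rather than being concatenated afterwards. A hedged end-to-end sketch for reference:

```python
# Hedged sketch of ordinary BLIP-2 captioning; output text should be the same
# before and after this change, only the BOS handling moved.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

model_id = "Salesforce/blip2-opt-2.7b"
processor = AutoProcessor.from_pretrained(model_id)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="a photo of", return_tensors="pt").to(model.device, torch.float16)

generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```
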
diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py
index fa6a99f71a4616..4129920f9b3663 100644
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -74,8 +74,11 @@ class Blip2Processor(ProcessorMixin):
def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
tokenizer.return_token_type_ids = False
self.current_processor = image_processor
- self.image_token = AddedToken("<image>", normalized=False, special=True)
- tokenizer.add_tokens([self.image_token], special_tokens=True)
+ if not hasattr(tokenizer, "image_token"):
+ self.image_token = AddedToken("<image>", normalized=False, special=True)
+ tokenizer.add_tokens([self.image_token], special_tokens=True)
+ else:
+ self.image_token = tokenizer.image_token
self.num_query_tokens = num_query_tokens
super().__init__(image_processor, tokenizer)
@@ -150,7 +153,7 @@ def __call__(
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# cast to desired return tensors type
diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py
index 4985b6ef89fec2..de49283493b63f 100644
--- a/src/transformers/models/bridgetower/configuration_bridgetower.py
+++ b/src/transformers/models/bridgetower/configuration_bridgetower.py
@@ -14,9 +14,6 @@
# limitations under the License.
"""BridgeTower model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig):
```"""
model_type = "bridgetower_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -95,21 +93,6 @@ def __init__(
self.share_layernorm = share_layernorm
self.remove_last_layer = remove_last_layer
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- if config_dict.get("model_type") == "bridgetower":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class BridgeTowerTextConfig(PretrainedConfig):
r"""
@@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig):
```"""
model_type = "bridgetower_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -217,21 +201,6 @@ def __init__(
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- if config_dict.get("model_type") == "bridgetower":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class BridgeTowerConfig(PretrainedConfig):
r"""
@@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig):
```"""
model_type = "bridgetower"
+ sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py
index 67de37f2d01b2c..9842127e7bb48f 100644
--- a/src/transformers/models/chameleon/configuration_chameleon.py
+++ b/src/transformers/models/chameleon/configuration_chameleon.py
@@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig):
"""
model_type = "chameleon_vqgan"
+ base_config_key = "vq_config"
def __init__(
self,
@@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig):
```"""
model_type = "chameleon"
+ sub_configs = {"vq_config": ChameleonVQVAEConfig}
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index 797908277930cf..0661da8727996f 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -1288,7 +1288,7 @@ def forward(
if pixel_values is not None:
image_tokens = self.get_image_tokens(pixel_values)
n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum().item()
- n_image_features = image_tokens.shape[0]
+ n_image_features = image_tokens.shape[0] * image_tokens.shape[1]
if n_image_tokens_in_text != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}"
diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py
index 2d699c8f663a61..e2a50d1af51b9e 100644
--- a/src/transformers/models/chameleon/processing_chameleon.py
+++ b/src/transformers/models/chameleon/processing_chameleon.py
@@ -66,9 +66,12 @@ class ChameleonProcessor(ProcessorMixin):
def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
self.image_seq_length = image_seq_length
- self.image_token = image_token
- self.image_start_token = "<racm3:break>" # fixed tokens for start and end, so can hardcode
- self.image_end_token = "<eoss>"
+ self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
+ self.image_start_token = (
+ tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
+ ) # fixed tokens for start and end, so can hardcode
+ self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "<eoss>"
+
super().__init__(image_processor, tokenizer)
def __call__(
diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
index 5b37044fab500d..d50d6c842b313c 100644
--- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
@@ -14,9 +14,8 @@
# limitations under the License.
"""Chinese-CLIP model configuration"""
-import os
from collections import OrderedDict
-from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:
@@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig):
```"""
model_type = "chinese_clip_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -141,24 +141,6 @@ def __init__(
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from ChineseCLIPConfig
- if config_dict.get("model_type") == "chinese_clip":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class ChineseCLIPVisionConfig(PretrainedConfig):
r"""
@@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
```"""
model_type = "chinese_clip_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -249,24 +232,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from ChineseCLIPConfig
- if config_dict.get("model_type") == "chinese_clip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class ChineseCLIPConfig(PretrainedConfig):
r"""
@@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig):
```"""
model_type = "chinese_clip"
+ sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig}
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py
index 1425e2a86289cc..b2added7f0e073 100644
--- a/src/transformers/models/clap/configuration_clap.py
+++ b/src/transformers/models/clap/configuration_clap.py
@@ -14,9 +14,6 @@
# limitations under the License.
"""CLAP model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig):
```"""
model_type = "clap_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -137,24 +135,6 @@ def __init__(
self.projection_hidden_act = projection_hidden_act
self.projection_dim = projection_dim
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from ClapConfig
- if config_dict.get("model_type") == "clap":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class ClapAudioConfig(PretrainedConfig):
r"""
@@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig):
```"""
model_type = "clap_audio_model"
+ base_config_key = "audio_config"
def __init__(
self,
@@ -307,24 +288,6 @@ def __init__(
self.initializer_factor = initializer_factor
self.projection_hidden_act = projection_hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the audio config dict if we are loading from ClapConfig
- if config_dict.get("model_type") == "clap":
- config_dict = config_dict["audio_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class ClapConfig(PretrainedConfig):
r"""
@@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig):
```"""
model_type = "clap"
+ sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig}
def __init__(
self,
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index d0224e3caa5b28..f422b17b204f13 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -575,7 +575,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio
class ClapAudioLayer(nn.Module):
- def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+ def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.shift_size = shift_size
@@ -583,7 +583,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
self.input_resolution = input_resolution
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
- self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+ self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.intermediate = ClapAudioIntermediate(config, dim)
self.output = ClapAudioOutput(config, dim)
@@ -712,6 +712,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
+ drop_path_rate=drop_path[i],
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
)
for i in range(depth)
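
`ClapAudioLayer` now receives its own `drop_path_rate` from the per-depth schedule instead of always reading the single config value, so stochastic depth can actually ramp up with depth as in Swin. A hedged sketch of the linear schedule the encoder is assumed to build (mirroring the Swin convention the class is copied from):

```python
# Hedged sketch: config values are assumptions, and the linspace ramp mirrors
# the Swin-style schedule rather than quoting ClapAudioEncoder verbatim.
import torch

drop_path_rate = 0.1        # stand-in for config.drop_path_rate
depths = [2, 2, 6, 2]       # stand-in for config.depths

rates = torch.linspace(0, drop_path_rate, sum(depths)).tolist()
per_stage = [rates[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
for stage, stage_rates in enumerate(per_stage):
    print(f"stage {stage}: {[round(r, 3) for r in stage_rates]}")
# Previously every layer used config.drop_path_rate directly; now layer i of the
# flattened order gets rates[i] via the new drop_path_rate argument.
```
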
diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py
index 8e027f5c3f010f..2e1f2deede00c9 100644
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@@ -14,9 +14,8 @@
# limitations under the License.
"""CLIP model configuration"""
-import os
from collections import OrderedDict
-from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:
@@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig):
```"""
model_type = "clip_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -130,24 +130,6 @@ def __init__(
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from CLIPConfig
- if config_dict.get("model_type") == "clip":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class CLIPVisionConfig(PretrainedConfig):
r"""
@@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig):
```"""
model_type = "clip_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -239,24 +222,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from CLIPConfig
- if config_dict.get("model_type") == "clip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class CLIPConfig(PretrainedConfig):
r"""
@@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig):
```"""
model_type = "clip"
+ sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py
index 0ac8196fc7f546..5474840f357a34 100644
--- a/src/transformers/models/clipseg/configuration_clipseg.py
+++ b/src/transformers/models/clipseg/configuration_clipseg.py
@@ -14,9 +14,6 @@
# limitations under the License.
"""CLIPSeg model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig):
```"""
model_type = "clipseg_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -117,24 +115,6 @@ def __init__(
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from CLIPSegConfig
- if config_dict.get("model_type") == "clipseg":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class CLIPSegVisionConfig(PretrainedConfig):
r"""
@@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
```"""
model_type = "clipseg_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -222,24 +203,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from CLIPSegConfig
- if config_dict.get("model_type") == "clipseg":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class CLIPSegConfig(PretrainedConfig):
r"""
@@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig):
```"""
model_type = "clipseg"
+ sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py
index 8ff7f1cd96a0d2..4ead68032b6034 100644
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -205,7 +205,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
- def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
+ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=True) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
@@ -535,7 +535,7 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `True`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
@@ -574,7 +574,7 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `True`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
@@ -845,14 +845,13 @@ def __init__(self, config: CLIPSegVisionConfig):
@add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
- # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
def forward(
self,
- pixel_values: Optional[torch.FloatTensor] = None,
+ pixel_values: Optional[torch.FloatTensor],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
- interpolate_pos_encoding: Optional[bool] = False,
+ interpolate_pos_encoding: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@@ -864,9 +863,6 @@ def forward(
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
- if pixel_values is None:
- raise ValueError("You have to specify pixel_values")
-
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layrnorm(hidden_states)
@@ -912,7 +908,7 @@ def forward(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: Optional[bool] = False,
+ interpolate_pos_encoding: Optional[bool] = True,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@@ -1035,7 +1031,7 @@ def get_image_features(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: bool = False,
+ interpolate_pos_encoding: bool = True,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@@ -1091,7 +1087,7 @@ def forward(
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: bool = False,
+ interpolate_pos_encoding: bool = True,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPSegOutput]:
r"""
@@ -1397,7 +1393,7 @@ def forward(
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: bool = False,
+ interpolate_pos_encoding: bool = True,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPSegOutput]:
r"""
diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py
index d17a04c861bf3b..8fd0e150801a66 100644
--- a/src/transformers/models/clvp/configuration_clvp.py
+++ b/src/transformers/models/clvp/configuration_clvp.py
@@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig):
```"""
model_type = "clvp_encoder"
+ base_config_key = ["text_config", "speech_config"]
def __init__(
self,
@@ -141,7 +142,7 @@ def from_pretrained(
# make sure to have the config_type be either "text_config" or "speech_config"
# this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
- if config_type not in ["text_config", "speech_config"]:
+ if config_type not in cls.base_config_key:
raise ValueError(
f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
)
@@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig):
```"""
model_type = "clvp_decoder"
+ base_config_key = "decoder_config"
def __init__(
self,
@@ -314,24 +316,6 @@ def __init__(
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the speech config dict if we are loading from ClvpConfig
- if config_dict.get("model_type") == "clvp":
- config_dict = config_dict["decoder_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class ClvpConfig(PretrainedConfig):
r"""
@@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig):
```"""
model_type = "clvp"
- is_composition = True
+ sub_configs = {
+ "text_config": ClvpEncoderConfig,
+ "speech_config": ClvpEncoderConfig,
+ "decoder_config": ClvpDecoderConfig,
+ }
def __init__(
self,
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index b215fb6561bf81..d481d87e7ab8ed 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -890,7 +890,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -1068,7 +1068,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
return causal_mask
-# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
+# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
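The Cohere hunk makes the decoder loop respect `config.num_hidden_layers` even when `self.layers` holds more modules than the config asks for. A toy sketch of the observable effect, using a small randomly initialised config rather than a real checkpoint:

import torch
from transformers import CohereConfig, CohereForCausalLM

config = CohereConfig(
    vocab_size=1000, hidden_size=64, intermediate_size=128,
    num_hidden_layers=4, num_attention_heads=4, num_key_value_heads=4,
)
model = CohereForCausalLM(config)

# Only the first two decoder layers run now, because the forward loop iterates
# over self.layers[: self.config.num_hidden_layers].
model.config.num_hidden_layers = 2
out = model(input_ids=torch.randint(0, 1000, (1, 8)))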
diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py
index dde5232ae5cc9b..302b5e6a55821d 100644
--- a/src/transformers/models/dbrx/configuration_dbrx.py
+++ b/src/transformers/models/dbrx/configuration_dbrx.py
@@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig):
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
"""
+ base_config_key = "attn_config"
+
def __init__(
self,
attn_pdrop: float = 0.0,
@@ -55,29 +57,12 @@ def __init__(
self.kv_n_heads = kv_n_heads
self.rope_theta = rope_theta
- for k in ["model_type"]:
+ for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:
raise ValueError(f"Found unknown {kwargs=}")
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- if config_dict.get("model_type") == "dbrx":
- config_dict = config_dict["attn_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class DbrxFFNConfig(PretrainedConfig):
"""Configuration class for Dbrx FFN.
@@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig):
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
"""
+ base_config_key = "ffn_config"
+
def __init__(
self,
ffn_act_fn: dict = None,
@@ -122,29 +109,12 @@ def __init__(
self.moe_loss_weight = moe_loss_weight
self.moe_normalize_expert_weights = moe_normalize_expert_weights
- for k in ["model_type"]:
+ for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:
raise ValueError(f"Found unknown {kwargs=}")
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- if config_dict.get("model_type") == "dbrx":
- config_dict = config_dict["ffn_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class DbrxConfig(PretrainedConfig):
r"""
@@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig):
"""
model_type = "dbrx"
+ sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig}
attribute_map = {
"num_attention_heads": "n_heads",
"hidden_size": "d_model",
diff --git a/src/transformers/models/deformable_detr/__init__.py b/src/transformers/models/deformable_detr/__init__.py
index ab44adf3718149..7c756c4bdffd7f 100644
--- a/src/transformers/models/deformable_detr/__init__.py
+++ b/src/transformers/models/deformable_detr/__init__.py
@@ -29,6 +29,7 @@
else:
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
+ _import_structure["image_processing_deformable_detr_fast"] = ["DeformableDetrImageProcessorFast"]
try:
if not is_torch_available():
@@ -54,6 +55,7 @@
else:
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
from .image_processing_deformable_detr import DeformableDetrImageProcessor
+ from .image_processing_deformable_detr_fast import DeformableDetrImageProcessorFast
try:
if not is_torch_available():
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
new file mode 100644
index 00000000000000..fde0540c5d494f
--- /dev/null
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
@@ -0,0 +1,1057 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Deformable DETR."""
+
+import functools
+import pathlib
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+ BaseImageProcessorFast,
+ SizeDict,
+ get_image_size_for_max_height_width,
+ get_max_height_width,
+ safe_squeeze,
+)
+from ...image_transforms import (
+ center_to_corners_format,
+ corners_to_center_format,
+)
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ ImageType,
+ PILImageResampling,
+ get_image_size,
+ get_image_type,
+ infer_channel_dimension_format,
+ make_list_of_images,
+ pil_torch_interpolation_mapping,
+ validate_annotations,
+ validate_kwargs,
+)
+from ...utils import (
+ TensorType,
+ is_torch_available,
+ is_torchvision_available,
+ is_torchvision_v2_available,
+ is_vision_available,
+ logging,
+)
+from .image_processing_deformable_detr import (
+ get_size_with_aspect_ratio,
+)
+
+
+if is_torch_available():
+ import torch
+
+if is_torchvision_available():
+ from torchvision.io import read_image
+
+ if is_vision_available():
+ from ...image_utils import pil_torch_interpolation_mapping
+
+ if is_torchvision_v2_available():
+ from torchvision.transforms.v2 import functional as F
+ else:
+ from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# Copied from transformers.models.detr.image_processing_detr_fast.convert_coco_poly_to_mask
+def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
+ """
+ Convert a COCO polygon annotation to a mask.
+
+ Args:
+ segmentations (`List[List[float]]`):
+ List of polygons, each polygon represented by a list of x-y coordinates.
+ height (`int`):
+ Height of the mask.
+ width (`int`):
+ Width of the mask.
+ """
+ try:
+ from pycocotools import mask as coco_mask
+ except ImportError:
+ raise ImportError("Pycocotools is not installed in your environment.")
+
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
+ mask = torch.any(mask, axis=2)
+ masks.append(mask)
+ if masks:
+ masks = torch.stack(masks, axis=0)
+ else:
+ masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
+
+ return masks
+
+
+# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_detection_annotation with DETR->DeformableDetr
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by DeformableDetr.
+ """
+ image_height, image_width = image.size()[-2:]
+
+ image_id = target["image_id"]
+ image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ classes = []
+ area = []
+ boxes = []
+ keypoints = []
+ for obj in annotations:
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
+ classes.append(obj["category_id"])
+ area.append(obj["area"])
+ boxes.append(obj["bbox"])
+ if "keypoints" in obj:
+ keypoints.append(obj["keypoints"])
+
+ classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+ # guard against no boxes via resizing
+ boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {
+ "image_id": image_id,
+ "class_labels": classes[keep],
+ "boxes": boxes[keep],
+ "area": area[keep],
+ "iscrowd": iscrowd[keep],
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+ }
+
+ if keypoints:
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ if return_segmentation_masks:
+ segmentation_masks = [obj["segmentation"] for obj in annotations]
+ masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
+ new_target["masks"] = masks[keep]
+
+ return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr_fast.masks_to_boxes
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Compute the bounding boxes around the provided panoptic segmentation masks.
+
+ Args:
+ masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
+
+ Returns:
+ boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+ """
+ if masks.numel() == 0:
+ return torch.zeros((0, 4), device=masks.device)
+
+ h, w = masks.shape[-2:]
+ y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
+ x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
+ # see https://github.com/pytorch/pytorch/issues/50276
+ y, x = torch.meshgrid(y, x, indexing="ij")
+
+ x_mask = masks * torch.unsqueeze(x, 0)
+ x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
+ x_min = (
+ torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+ )
+
+ y_mask = masks * torch.unsqueeze(y, 0)
+ y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
+ y_min = (
+ torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+ )
+
+ return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# Copied from transformers.models.detr.image_processing_detr_fast.rgb_to_id
+def rgb_to_id(color):
+ """
+ Converts RGB color to unique ID.
+ """
+ if isinstance(color, torch.Tensor) and len(color.shape) == 3:
+ if color.dtype == torch.uint8:
+ color = color.to(torch.int32)
+ return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
+ return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+
+
+# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_panoptic_annotation with DETR->DeformableDetr
+def prepare_coco_panoptic_annotation(
+ image: torch.Tensor,
+ target: Dict,
+ masks_path: Union[str, pathlib.Path],
+ return_masks: bool = True,
+ input_data_format: Union[ChannelDimension, str] = None,
+) -> Dict:
+ """
+ Prepare a coco panoptic annotation for DeformableDetr.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+ annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+ new_target = {}
+ new_target["image_id"] = torch.as_tensor(
+ [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
+ )
+ new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+ new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+
+ if "segments_info" in target:
+ masks = read_image(annotation_path).permute(1, 2, 0).to(torch.int32).to(image.device)
+ masks = rgb_to_id(masks)
+
+ ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
+ masks = masks == ids[:, None, None]
+ masks = masks.to(torch.bool)
+ if return_masks:
+ new_target["masks"] = masks
+ new_target["boxes"] = masks_to_boxes(masks)
+ new_target["class_labels"] = torch.as_tensor(
+ [segment_info["category_id"] for segment_info in target["segments_info"]],
+ dtype=torch.int64,
+ device=image.device,
+ )
+ new_target["iscrowd"] = torch.as_tensor(
+ [segment_info["iscrowd"] for segment_info in target["segments_info"]],
+ dtype=torch.int64,
+ device=image.device,
+ )
+ new_target["area"] = torch.as_tensor(
+ [segment_info["area"] for segment_info in target["segments_info"]],
+ dtype=torch.float32,
+ device=image.device,
+ )
+
+ return new_target
+
+
+class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
+ r"""
+ Constructs a fast Deformable DETR image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ model_input_names = ["pixel_values", "pixel_mask"]
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.__init__
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Union[float, List[float]] = None,
+ image_std: Union[float, List[float]] = None,
+ do_convert_annotations: Optional[bool] = None,
+ do_pad: bool = True,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ if "pad_and_return_pixel_mask" in kwargs:
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
+ "Please specify in `size['longest_edge'] instead`.",
+ )
+ max_size = kwargs.pop("max_size")
+ else:
+ max_size = None if size is None else 1333
+
+ size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+ size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+ # Backwards compatibility
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ super().__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+ self._valid_processor_keys = [
+ "images",
+ "annotations",
+ "return_segmentation_masks",
+ "masks_path",
+ "do_resize",
+ "size",
+ "resample",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "do_convert_annotations",
+ "image_mean",
+ "image_std",
+ "do_pad",
+ "pad_size",
+ "format",
+ "return_tensors",
+ "data_format",
+ "input_data_format",
+ ]
+
+ @classmethod
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.from_dict with Detr->DeformableDetr
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+ """
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+ created using from_dict and kwargs e.g. `DeformableDetrImageProcessorFast.from_pretrained(checkpoint, size=600,
+ max_size=800)`
+ """
+ image_processor_dict = image_processor_dict.copy()
+ if "max_size" in kwargs:
+ image_processor_dict["max_size"] = kwargs.pop("max_size")
+ if "pad_and_return_pixel_mask" in kwargs:
+ image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+ return super().from_dict(image_processor_dict, **kwargs)
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.prepare_annotation with DETR->DeformableDetr
+ def prepare_annotation(
+ self,
+ image: torch.Tensor,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ """
+ Prepare an annotation for feeding into DeformableDetr model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ elif format == AnnotationFormat.COCO_PANOPTIC:
+ return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_panoptic_annotation(
+ image,
+ target,
+ masks_path=masks_path,
+ return_masks=return_segmentation_masks,
+ input_data_format=input_data_format,
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize
+ def resize(
+ self,
+ image: torch.Tensor,
+ size: SizeDict,
+ interpolation: "F.InterpolationMode" = None,
+ **kwargs,
+ ) -> torch.Tensor:
+ """
+ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+ int, smaller edge of the image will be matched to this number.
+
+ Args:
+ image (`torch.Tensor`):
+ Image to resize.
+ size (`SizeDict`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+ if size.shortest_edge and size.longest_edge:
+ # Resize the image so that the shortest edge or the longest edge is of the given size
+ # while maintaining the aspect ratio of the original image.
+ new_size = get_size_with_aspect_ratio(
+ image.size()[-2:],
+ size["shortest_edge"],
+ size["longest_edge"],
+ )
+ elif size.max_height and size.max_width:
+ new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
+ elif size.height and size.width:
+ new_size = (size["height"], size["width"])
+ else:
+ raise ValueError(
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+ f" {size.keys()}."
+ )
+
+ image = F.resize(
+ image,
+ size=new_size,
+ interpolation=interpolation,
+ **kwargs,
+ )
+ return image
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation
+ def resize_annotation(
+ self,
+ annotation: Dict[str, Any],
+ orig_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ threshold: float = 0.5,
+ interpolation: "F.InterpolationMode" = None,
+ ):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`Dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`Tuple[int, int]`):
+ The original size of the input image.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+ resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`):
+ The resampling filter to use when resizing the masks.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
+ ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * torch.as_tensor(
+ [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
+ )
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+ masks = torch.stack(masks).to(torch.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= torch.as_tensor(
+ [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
+ )
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: Dict,
+ input_image_size: Tuple[int, int],
+ output_image_size: Tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> Dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+ ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = F.pad(
+ masks,
+ padding,
+ fill=0,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
+ def pad(
+ self,
+ image: torch.Tensor,
+ padded_size: Tuple[int, int],
+ annotation: Optional[Dict[str, Any]] = None,
+ update_bboxes: bool = True,
+ fill: int = 0,
+ ):
+ original_size = image.size()[-2:]
+ padding_bottom = padded_size[0] - original_size[0]
+ padding_right = padded_size[1] - original_size[1]
+ if padding_bottom < 0 or padding_right < 0:
+ raise ValueError(
+ f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+ f"original size. Got padded size: {padded_size}, original size: {original_size}."
+ )
+ if original_size != padded_size:
+ padding = [0, 0, padding_right, padding_bottom]
+ image = F.pad(image, padding, fill=fill)
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, original_size, padded_size, padding, update_bboxes
+ )
+
+ # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+ pixel_mask[: original_size[0], : original_size[1]] = 1
+
+ return image, pixel_mask, annotation
+
+ @functools.lru_cache(maxsize=1)
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
+ def _validate_input_arguments(
+ self,
+ do_rescale: bool,
+ rescale_factor: float,
+ do_normalize: bool,
+ image_mean: Union[float, List[float]],
+ image_std: Union[float, List[float]],
+ do_resize: bool,
+ size: Dict[str, int],
+ resample: "PILImageResampling",
+ data_format: Union[str, ChannelDimension],
+ return_tensors: Union[TensorType, str],
+ ):
+ if return_tensors != "pt":
+ raise ValueError("Only returning PyTorch tensors is currently supported.")
+
+ if data_format != ChannelDimension.FIRST:
+ raise ValueError("Only channel first data format is currently supported.")
+
+ if do_resize and None in (size, resample):
+ raise ValueError("Size and resample must be specified if do_resize is True.")
+
+ if do_rescale and rescale_factor is None:
+ raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+ if do_normalize and None in (image_mean, image_std):
+ raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
+
+ # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.preprocess
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ do_convert_annotations: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ if "pad_and_return_pixel_mask" in kwargs:
+ logger.warning_once(
+ "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+ "use `do_pad` instead."
+ )
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` argument is deprecated and will be removed in a future version, use"
+ " `size['longest_edge']` instead."
+ )
+ size = kwargs.pop("max_size")
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=False)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+ device = kwargs.pop("device", None)
+
+ # Make hashable for cache
+ size = SizeDict(**size)
+ image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
+ image_std = tuple(image_std) if isinstance(image_std, list) else image_std
+
+ images = make_list_of_images(images)
+ image_type = get_image_type(images[0])
+
+ if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+ raise ValueError(f"Unsupported input image type {image_type}")
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+ self._validate_input_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ return_tensors=return_tensors,
+ data_format=data_format,
+ )
+
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ if (
+ masks_path is not None
+ and format == AnnotationFormat.COCO_PANOPTIC
+ and not isinstance(masks_path, (pathlib.Path, str))
+ ):
+ raise ValueError(
+ "The path to the directory containing the mask PNG files should be provided as a"
+ f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+ )
+
+ data = {}
+ if image_type == ImageType.PIL:
+ images = [F.pil_to_tensor(image) for image in images]
+ elif image_type == ImageType.NUMPY:
+ # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
+ images = [torch.from_numpy(image).contiguous() for image in images]
+
+ if device is not None:
+ images = [image.to(device) for image in images]
+
+ # We assume that all images have the same channel dimension format.
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+ if input_data_format == ChannelDimension.LAST:
+ images = [image.permute(2, 0, 1).contiguous() for image in images]
+ input_data_format = ChannelDimension.FIRST
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
+ new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)
+
+ processed_images = []
+ processed_annotations = []
+ pixel_masks = [] # Initialize pixel_masks here
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ annotation = self.prepare_annotation(
+ image,
+ annotation,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+
+ if do_resize:
+ interpolation = (
+ pil_torch_interpolation_mapping[resample]
+ if isinstance(resample, (PILImageResampling, int))
+ else resample
+ )
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
+ if annotations is not None:
+ annotation = self.resize_annotation(
+ annotation,
+ orig_size=image.size()[-2:],
+ target_size=resized_image.size()[-2:],
+ )
+ image = resized_image
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
+ elif do_rescale:
+ image = image * rescale_factor
+ elif do_normalize:
+ image = F.normalize(image, image_mean, image_std)
+
+ if do_convert_annotations and annotations is not None:
+ annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+
+ processed_images.append(image)
+ processed_annotations.append(annotation)
+ images = processed_images
+ annotations = processed_annotations if annotations is not None else None
+
+ if do_pad:
+ # depends on all resized image shapes so we need another loop
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images)
+
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ if padded_size == image.size()[-2:]:
+ padded_images.append(image)
+ pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+ padded_annotations.append(annotation)
+ continue
+ image, pixel_mask, annotation = self.pad(
+ image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+ )
+ padded_images.append(image)
+ padded_annotations.append(annotation)
+ pixel_masks.append(pixel_mask)
+ images = padded_images
+ annotations = padded_annotations if annotations is not None else None
+ data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+ data.update({"pixel_values": torch.stack(images, dim=0)})
+ encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+ return encoded_inputs
+
+ # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process
+ def post_process(self, outputs, target_sizes):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+ Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+ original image size (before any data augmentation). For visualization, this should be the image size
+ after data augmentation, but before padding.
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ logger.warning_once(
+ "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+ " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+ )
+
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if len(out_logits) != len(target_sizes):
+ raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+ if target_sizes.shape[1] != 2:
+ raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+ prob = out_logits.sigmoid()
+ topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+ return results
+
+ # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection
+ def post_process_object_detection(
+ self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+ ):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+ top_k (`int`, *optional*, defaults to 100):
+ Keep only top k bounding boxes before filtering by thresholding.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+
+ prob = out_logits.sigmoid()
+ prob = prob.view(out_logits.shape[0], -1)
+ k_value = min(top_k, prob.size(1))
+ topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ if target_sizes is not None:
+ if isinstance(target_sizes, List):
+ img_h = torch.Tensor([i[0] for i in target_sizes])
+ img_w = torch.Tensor([i[1] for i in target_sizes])
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = []
+ for s, l, b in zip(scores, labels, boxes):
+ score = s[s > threshold]
+ label = l[s > threshold]
+ box = b[s > threshold]
+ results.append({"scores": score, "labels": label, "boxes": box})
+
+ return results
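End-to-end, the new fast processor is a drop-in counterpart to `DeformableDetrImageProcessor` that operates on torch tensors (torchvision is required) and currently only returns PyTorch tensors. A hedged usage sketch, not part of the new file; the checkpoint and image URL are illustrative:

import requests
import torch
from PIL import Image
from transformers import DeformableDetrForObjectDetection
from transformers.models.deformable_detr.image_processing_deformable_detr_fast import (
    DeformableDetrImageProcessorFast,
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DeformableDetrImageProcessorFast()
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

inputs = processor(images=image, return_tensors="pt")  # preprocess() only supports "pt"
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)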
diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py
index 59c628786328e6..4667c413457b19 100644
--- a/src/transformers/models/depth_anything/modeling_depth_anything.py
+++ b/src/transformers/models/depth_anything/modeling_depth_anything.py
@@ -224,16 +224,16 @@ def forward(self, hidden_states, size=None):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
- # first layer only uses the last hidden_state
- size = hidden_states[1].shape[2:]
- fused_hidden_state = self.layers[0](hidden_states[0], size=size)
- fused_hidden_states.append(fused_hidden_state)
+ fused_hidden_state = None
- # looping from the last layer to the second
- for idx, (hidden_state, layer) in enumerate(zip(hidden_states[1:], self.layers[1:])):
- size = hidden_states[1:][idx + 1].shape[2:] if idx != (len(hidden_states[1:]) - 1) else None
+ for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)):
+ size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None
- fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
+ if fused_hidden_state is None:
+ # first layer only uses the last hidden_state
+ fused_hidden_state = layer(hidden_state, size=size)
+ else:
+ fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_states.append(fused_hidden_state)
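The Depth Anything hunk folds the "first fusion layer only sees the last hidden state" special case into a single loop; behaviour is unchanged because the running `fused_hidden_state` starts as `None`. A standalone sketch of the same control flow with dummy fusion callables (not the real DepthAnything modules):

import torch
import torch.nn.functional as F


def dummy_fusion_layer(*features, size=None):
    # Stand-in for a feature fusion layer: sum the inputs, then optionally
    # upsample the result to `size`.
    fused = features[0] if len(features) == 1 else features[0] + features[1]
    if size is not None:
        fused = F.interpolate(fused, size=size, mode="bilinear", align_corners=False)
    return fused


def run_fusion_stage(hidden_states, layers):
    hidden_states = hidden_states[::-1]  # start from the lowest-resolution feature
    fused_hidden_states = []
    fused_hidden_state = None
    for idx, (hidden_state, layer) in enumerate(zip(hidden_states, layers)):
        size = hidden_states[idx + 1].shape[2:] if idx != len(hidden_states) - 1 else None
        if fused_hidden_state is None:
            fused_hidden_state = layer(hidden_state, size=size)  # first layer: no running fusion yet
        else:
            fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
        fused_hidden_states.append(fused_hidden_state)
    return fused_hidden_states


features = [torch.randn(1, 8, s, s) for s in (32, 16, 8, 4)]  # high -> low resolution
outputs = run_fusion_stage(features, [dummy_fusion_layer] * 4)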
diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py
index 0fa1d0ffd9dba9..0d28d7df7a647a 100644
--- a/src/transformers/models/detr/image_processing_detr_fast.py
+++ b/src/transformers/models/detr/image_processing_detr_fast.py
@@ -21,7 +21,13 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from ...image_processing_utils import BatchFeature, get_size_dict
-from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
+from ...image_processing_utils_fast import (
+ BaseImageProcessorFast,
+ SizeDict,
+ get_image_size_for_max_height_width,
+ get_max_height_width,
+ safe_squeeze,
+)
from ...image_transforms import (
center_to_corners_format,
corners_to_center_format,
@@ -55,7 +61,6 @@
compute_segments,
convert_segmentation_to_rle,
get_size_with_aspect_ratio,
- max_across_indices,
remove_low_and_no_objects,
)
@@ -85,60 +90,6 @@
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
-def get_image_size_for_max_height_width(
- image_size: Tuple[int, int],
- max_height: int,
- max_width: int,
-) -> Tuple[int, int]:
- """
- Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
- Important, even if image_height < max_height and image_width < max_width, the image will be resized
- to at least one of the edges be equal to max_height or max_width.
-
- For example:
- - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
-
- Args:
- image_size (`Tuple[int, int]`):
- The image to resize.
- max_height (`int`):
- The maximum allowed height.
- max_width (`int`):
- The maximum allowed width.
- """
- height, width = image_size
- height_scale = max_height / height
- width_scale = max_width / width
- min_scale = min(height_scale, width_scale)
- new_height = int(height * min_scale)
- new_width = int(width * min_scale)
- return new_height, new_width
-
-
-def safe_squeeze(tensor: torch.Tensor, axis: Optional[int] = None) -> torch.Tensor:
- """
- Squeezes a tensor, but only if the axis specified has dim 1.
- """
- if axis is None:
- return tensor.squeeze()
-
- try:
- return tensor.squeeze(axis=axis)
- except ValueError:
- return tensor
-
-
-def get_max_height_width(images: List[torch.Tensor]) -> Tuple[int]:
- """
- Get the maximum height and width across all images in a batch.
- """
-
- _, max_height, max_width = max_across_indices([img.shape for img in images])
-
- return (max_height, max_width)
-
-
# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
"""
@@ -191,18 +142,21 @@ def prepare_coco_detection_annotation(
# Get all COCO annotations for the given image.
annotations = target["annotations"]
- annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+ classes = []
+ area = []
+ boxes = []
+ keypoints = []
+ for obj in annotations:
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
+ classes.append(obj["category_id"])
+ area.append(obj["area"])
+ boxes.append(obj["bbox"])
+ if "keypoints" in obj:
+ keypoints.append(obj["keypoints"])
- classes = [obj["category_id"] for obj in annotations]
classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
-
- # for conversion to coco api
- area = torch.as_tensor([obj["area"] for obj in annotations], dtype=torch.float32, device=image.device)
- iscrowd = torch.as_tensor(
- [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=torch.int64, device=image.device
- )
-
- boxes = [obj["bbox"] for obj in annotations]
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
@@ -211,19 +165,16 @@ def prepare_coco_detection_annotation(
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
- new_target = {}
- new_target["image_id"] = image_id
- new_target["class_labels"] = classes[keep]
- new_target["boxes"] = boxes[keep]
- new_target["area"] = area[keep]
- new_target["iscrowd"] = iscrowd[keep]
- new_target["orig_size"] = torch.as_tensor(
- [int(image_height), int(image_width)], dtype=torch.int64, device=image.device
- )
+ new_target = {
+ "image_id": image_id,
+ "class_labels": classes[keep],
+ "boxes": boxes[keep],
+ "area": area[keep],
+ "iscrowd": iscrowd[keep],
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+ }
- if annotations and "keypoints" in annotations[0]:
- keypoints = [obj["keypoints"] for obj in annotations]
- # Converting the filtered keypoints list to a numpy array
+ if keypoints:
keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
# Apply the keep mask here to filter the relevant annotations
keypoints = keypoints[keep]
@@ -396,7 +347,7 @@ def __init__(
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
- resample: [Union[PILImageResampling, F.InterpolationMode]] = PILImageResampling.BILINEAR,
+ resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
@@ -465,7 +416,7 @@ def __init__(
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
- created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600,
+ created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600,
max_size=800)`
"""
image_processor_dict = image_processor_dict.copy()
@@ -511,7 +462,7 @@ def resize(
self,
image: torch.Tensor,
size: SizeDict,
- interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR,
+ interpolation: "F.InterpolationMode" = None,
**kwargs,
) -> torch.Tensor:
"""
@@ -534,6 +485,7 @@ def resize(
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
Resampling filter to use if resizing the image.
"""
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
if size.shortest_edge and size.longest_edge:
# Resize the image so that the shortest edge or the longest edge is of the given size
# while maintaining the aspect ratio of the original image.
@@ -566,7 +518,7 @@ def resize_annotation(
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
- interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST,
+ interpolation: "F.InterpolationMode" = None,
):
"""
Resizes an annotation to a target size.
@@ -583,6 +535,7 @@ def resize_annotation(
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`):
The resampling filter to use when resizing the masks.
"""
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}
@@ -729,7 +682,7 @@ def preprocess(
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
- resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None,
+ resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
@@ -910,85 +863,83 @@ def preprocess(
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.LAST:
images = [image.permute(2, 0, 1).contiguous() for image in images]
+ input_data_format = ChannelDimension.FIRST
- # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
- if annotations is not None:
- prepared_images = []
- prepared_annotations = []
- for image, target in zip(images, annotations):
- target = self.prepare_annotation(
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
+ new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)
+
+ processed_images = []
+ processed_annotations = []
+ pixel_masks = [] # Initialize pixel_masks here
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ annotation = self.prepare_annotation(
image,
- target,
+ annotation,
format,
return_segmentation_masks=return_segmentation_masks,
masks_path=masks_path,
input_data_format=input_data_format,
)
- prepared_images.append(image)
- prepared_annotations.append(target)
- images = prepared_images
- annotations = prepared_annotations
- del prepared_images, prepared_annotations
-
- if do_resize:
- if isinstance(resample, (PILImageResampling, int)):
- interpolation = pil_torch_interpolation_mapping[resample]
- else:
- interpolation = resample
- resized_images = [self.resize(image, size=size, interpolation=interpolation) for image in images]
- if annotations is not None:
- for i, (image, target) in enumerate(zip(resized_images, annotations)):
- annotations[i] = self.resize_annotation(
- target,
- orig_size=images[i].size()[-2:],
- target_size=image.size()[-2:],
+
+ if do_resize:
+ interpolation = (
+ pil_torch_interpolation_mapping[resample]
+ if isinstance(resample, (PILImageResampling, int))
+ else resample
+ )
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
+ if annotations is not None:
+ annotation = self.resize_annotation(
+ annotation,
+ orig_size=image.size()[-2:],
+ target_size=resized_image.size()[-2:],
)
- images = resized_images
- del resized_images
+ image = resized_image
- if do_rescale and do_normalize:
- # fused rescale and normalize
- new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
- new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)
- images = [F.normalize(image.to(dtype=torch.float32), new_mean, new_std) for image in images]
- elif do_rescale:
- images = [image * rescale_factor for image in images]
- elif do_normalize:
- images = [F.normalize(image, image_mean, image_std) for image in images]
-
- if do_convert_annotations and annotations is not None:
- annotations = [
- self.normalize_annotation(annotation, get_image_size(image, input_data_format))
- for annotation, image in zip(annotations, images)
- ]
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
+ elif do_rescale:
+ image = image * rescale_factor
+ elif do_normalize:
+ image = F.normalize(image, image_mean, image_std)
+
+ if do_convert_annotations and annotations is not None:
+ annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+
+ processed_images.append(image)
+ processed_annotations.append(annotation)
+ images = processed_images
+ annotations = processed_annotations if annotations is not None else None
if do_pad:
- # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ # depends on all resized image shapes so we need another loop
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images)
- annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
- pixel_masks = []
padded_annotations = []
- for image, annotation in zip(images, annotation_list):
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
if padded_size == image.size()[-2:]:
padded_images.append(image)
pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
padded_annotations.append(annotation)
continue
- padded_image, pixel_mask, padded_annotation = self.pad(
+ image, pixel_mask, annotation = self.pad(
image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
)
- padded_images.append(padded_image)
+ padded_images.append(image)
+ padded_annotations.append(annotation)
pixel_masks.append(pixel_mask)
- padded_annotations.append(padded_annotation)
images = padded_images
- if annotations is not None:
- annotations = padded_annotations
- del padded_images, padded_annotations
+ annotations = padded_annotations if annotations is not None else None
data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
data.update({"pixel_values": torch.stack(images, dim=0)})
diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py
index 8d639131b841ca..2d5272e8642ee5 100644
--- a/src/transformers/models/donut/modeling_donut_swin.py
+++ b/src/transformers/models/donut/modeling_donut_swin.py
@@ -558,7 +558,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin
class DonutSwinLayer(nn.Module):
- def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+ def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.shift_size = shift_size
@@ -566,7 +566,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
self.input_resolution = input_resolution
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
- self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+ self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.intermediate = DonutSwinIntermediate(config, dim)
self.output = DonutSwinOutput(config, dim)
@@ -695,6 +695,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
+ drop_path_rate=drop_path[i],
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
)
for i in range(depth)
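For context on the `drop_path_rate` plumbing above: in the Swin family the per-layer stochastic-depth rates usually follow a linear schedule over the total depth, and each stage hands its own slice to its layers, which is what `drop_path[i]` indexes. A hedged sketch of that schedule, with illustrative values rather than a specific Donut config:

```python
import torch

drop_path_rate = 0.1    # assumed top-level config value
depths = [2, 2, 14, 2]  # assumed per-stage depths

# one rate per layer, increasing linearly with depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

# each stage gets its slice; inside a stage, layer i receives drop_path[i]
per_stage = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
for stage, rates in enumerate(per_stage):
    print(f"stage {stage}: {[round(r, 3) for r in rates]}")
```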
diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py
index 9552d323ac57c0..b46ff4bcfab902 100644
--- a/src/transformers/models/donut/processing_donut.py
+++ b/src/transformers/models/donut/processing_donut.py
@@ -24,12 +24,16 @@
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
class DonutProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {}
+logger = logging.get_logger(__name__)
+
+
class DonutProcessor(ProcessorMixin):
r"""
Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
@@ -85,6 +89,16 @@ def __call__(
[`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
"""
# For backward compatibility
+ legacy = kwargs.pop("legacy", True)
+ if legacy:
+ # With `add_special_tokens=True`, the performance of Donut is degraded when working with both images and text.
+ logger.warning_once(
+ "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. "
+ "In the new behavior, if both images and text are provided, the default value of `add_special_tokens` "
+ "will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. "
+ "To test the new behavior, set `legacy=False`as a processor call argument."
+ )
+
if self._in_target_context_manager:
return self.current_processor(images, text, **kwargs)
@@ -100,6 +114,8 @@ def __call__(
if images is not None:
inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
if text is not None:
+ if not legacy and images is not None:
+ output_kwargs["text_kwargs"].setdefault("add_special_tokens", False)
encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])
if text is None:
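A short usage sketch of the new `legacy` switch; the checkpoint name and prompt are only illustrative, and the image is a dummy array:

```python
import numpy as np
from PIL import Image
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
prompt = "<s_docvqa><s_question>What is shown?</s_question><s_answer>"

# legacy=True (the default) keeps today's behaviour and emits the warning above;
# legacy=False lets the tokenizer call default to add_special_tokens=False
inputs = processor(images=image, text=prompt, legacy=False, return_tensors="pt")
```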
diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py
index 2d4654a234c2c6..5886d288b88271 100755
--- a/src/transformers/models/dpt/modeling_dpt.py
+++ b/src/transformers/models/dpt/modeling_dpt.py
@@ -689,12 +689,13 @@ def forward(self, hidden_states):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
- # first layer only uses the last hidden_state
- fused_hidden_state = self.layers[0](hidden_states[0])
- fused_hidden_states.append(fused_hidden_state)
- # looping from the last layer to the second
- for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]):
- fused_hidden_state = layer(fused_hidden_state, hidden_state)
+ fused_hidden_state = None
+ for hidden_state, layer in zip(hidden_states, self.layers):
+ if fused_hidden_state is None:
+ # first layer only uses the last hidden_state
+ fused_hidden_state = layer(hidden_state)
+ else:
+ fused_hidden_state = layer(fused_hidden_state, hidden_state)
fused_hidden_states.append(fused_hidden_state)
return fused_hidden_states
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index ab5d49b32fea90..5190ed51ffd350 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -17,6 +17,7 @@
from ...configuration_utils import PretrainedConfig
from ...utils import logging
+from ..auto import AutoConfig
logger = logging.get_logger(__name__)
@@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig):
```"""
model_type = "encoder-decoder"
+ sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
is_composition = True
def __init__(self, **kwargs):
@@ -84,8 +86,6 @@ def __init__(self, **kwargs):
decoder_config = kwargs.pop("decoder")
decoder_model_type = decoder_config.pop("model_type")
- from ..auto.configuration_auto import AutoConfig
-
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
self.is_encoder_decoder = True
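As a reminder of what the two `AutoConfig` entries in `sub_configs` track, a small sketch composing an encoder-decoder config via the existing `from_encoder_decoder_configs` helper (model sizes are arbitrary):

```python
from transformers import BertConfig, EncoderDecoderConfig

encoder = BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=2, intermediate_size=256)
decoder = BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=2, intermediate_size=256)

config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder, decoder)
print(config.encoder.model_type, config.decoder.model_type)  # bert bert
```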
diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py
index 9634a20015f207..083664747ddd85 100644
--- a/src/transformers/models/esm/configuration_esm.py
+++ b/src/transformers/models/esm/configuration_esm.py
@@ -87,11 +87,14 @@ class EsmConfig(PretrainedConfig):
```python
>>> from transformers import EsmModel, EsmConfig
- >>> # Initializing a ESM facebook/esm-1b style configuration >>> configuration = EsmConfig()
+ >>> # Initializing a ESM facebook/esm-1b style configuration
+ >>> configuration = EsmConfig(vocab_size=33)
- >>> # Initializing a model from the configuration >>> model = ESMModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = EsmModel(configuration)
- >>> # Accessing the model configuration >>> configuration = model.config
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
```"""
model_type = "esm"
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index 504dcf10b206c3..faea670ecbf428 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -1277,12 +1277,18 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
+ num_logits_to_keep: int = 0,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+
+ num_logits_to_keep (`int`, *optional*):
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only the last token's logits are needed for generation, and computing them
+ only for that token saves memory, which becomes significant for long sequences or large vocabulary sizes.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1302,7 +1308,7 @@ def forward(
)
hidden_states = transformer_outputs[0]
- lm_logits = self.lm_head(hidden_states)
+ lm_logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
loss = None
if labels is not None:
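To make the memory argument in the new docstring concrete: only the last position's logits are needed to sample the next token, so slicing before the LM head shrinks the projection from `(batch, seq, vocab)` to `(batch, 1, vocab)`; `num_logits_to_keep=0` keeps everything because `hidden_states[:, -0:]` is the full sequence. Illustrative shapes:

```python
import torch

batch, seq_len, hidden, vocab = 2, 1024, 512, 65024
hidden_states = torch.randn(batch, seq_len, hidden)
lm_head = torch.nn.Linear(hidden, vocab, bias=False)

full = lm_head(hidden_states)             # (2, 1024, 65024): ~133M floats, ~0.5 GB in fp32
last = lm_head(hidden_states[:, -1:, :])  # (2, 1, 65024): ~130K floats

print(full.shape, last.shape)
torch.testing.assert_close(full[:, -1:, :], last)
```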
diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py
index ade5b8b2667537..59a1b029751646 100644
--- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py
+++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py
@@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
```"""
model_type = "fastspeech2_conformer"
+ base_config_key = "model_config"
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
def __init__(
@@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
```"""
model_type = "hifigan"
+ base_config_key = "vocoder_config"
def __init__(
self,
@@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
"""
model_type = "fastspeech2_conformer_with_hifigan"
- is_composition = True
+ sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig}
def __init__(
self,
diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py
index b6349361c0dda8..47cdb488a2eb5d 100644
--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -14,8 +14,7 @@
# limitations under the License.
"""FLAVA model configurations"""
-import os
-from typing import Any, Dict, Union
+from typing import Any, Dict
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig):
```"""
model_type = "flava_image_model"
+ base_config_key = "image_config"
def __init__(
self,
@@ -124,24 +124,6 @@ def __init__(
self.mask_token = mask_token
self.vocab_size = vocab_size
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the image config dict if we are loading from FlavaConfig
- if config_dict.get("model_type") == "flava":
- config_dict = config_dict["image_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class FlavaTextConfig(PretrainedConfig):
r"""
@@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig):
```"""
model_type = "flava_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -254,24 +237,6 @@ def __init__(
self.qkv_bias = qkv_bias
self.pad_token_id = pad_token_id
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from FlavaConfig
- if config_dict.get("model_type") == "flava":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class FlavaMultimodalConfig(PretrainedConfig):
r"""
@@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
```"""
model_type = "flava_multimodal_model"
+ base_config_key = "multimodal_config"
def __init__(
self,
@@ -357,27 +323,10 @@ def __init__(
self.qkv_bias = qkv_bias
self.use_cls_token = use_cls_token
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the multimodal config dict if we are loading from FlavaConfig
- if config_dict.get("model_type") == "flava":
- config_dict = config_dict["multimodal_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class FlavaImageCodebookConfig(PretrainedConfig):
model_type = "flava_image_codebook"
+ base_config_key = "image_codebook_config"
r"""
[`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
@@ -442,24 +391,6 @@ def __init__(
self.freeze = freeze
self.initializer_range = initializer_range
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the image codebook config dict if we are loading from FlavaConfig
- if config_dict.get("model_type") == "flava":
- config_dict = config_dict["image_codebook_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class FlavaConfig(PretrainedConfig):
r"""
@@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig):
"""
model_type = "flava"
+ sub_configs = {
+ "text_config": FlavaTextConfig,
+ "image_config": FlavaImageConfig,
+ "multimodal_config": FlavaMultimodalConfig,
+ "image_codebook_config": FlavaImageCodebookConfig,
+ }
def __init__(
self,
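The deleted `from_pretrained` overrides all did the same thing: pull the right nested dict out of a composite `flava` config. A hedged sketch (not the actual `PretrainedConfig` code) of the selection logic that `base_config_key` lets the base class perform generically:

```python
def select_sub_config(config_dict: dict, cls) -> dict:
    """Pick the nested sub-config dict when loading a sub-model config from a composite checkpoint."""
    base_key = getattr(cls, "base_config_key", "")
    if base_key and base_key in config_dict:
        # e.g. FlavaImageConfig loaded from a full "flava" config -> config_dict["image_config"]
        return config_dict[base_key]
    return config_dict
```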
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py
index 255922b8308889..4bb9ea7964d416 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu.py
@@ -19,7 +19,7 @@
import numpy as np
-from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
pad,
resize,
@@ -475,6 +475,7 @@ def preprocess(
input_data_format = infer_channel_dimension_format(batch_images[0][0])
original_image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
+ size = get_size_dict(size) # for BC
if do_resize:
batch_images = [
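For reference, `get_size_dict` normalizes the older `size` formats (plain ints, `(height, width)` tuples) into the dict form the rest of the processor expects, which is the backward compatibility being restored here. Illustrative calls, assuming current `transformers` behaviour:

```python
from transformers.image_processing_utils import get_size_dict

print(get_size_dict({"height": 1080, "width": 1920}))       # already a dict: passed through
print(get_size_dict(224))                                    # int: {"height": 224, "width": 224}
print(get_size_dict((720, 1280), height_width_order=True))   # tuple: {"height": 720, "width": 1280}
```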
diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py
index c8c758e6888a59..2df5dbc8b29177 100644
--- a/src/transformers/models/fuyu/modeling_fuyu.py
+++ b/src/transformers/models/fuyu/modeling_fuyu.py
@@ -346,7 +346,7 @@ def prepare_inputs_for_generation(
):
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
- if past_key_values:
+ if past_key_values is not None:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
@@ -355,7 +355,7 @@ def prepare_inputs_for_generation(
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
- position_ids = position_ids[:, -1].unsqueeze(-1)
+ position_ids = position_ids[:, -1:]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
@@ -377,3 +377,12 @@ def prepare_inputs_for_generation(
}
)
return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
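A toy illustration of what the added `_reorder_cache` does during beam search: each cached tensor is gathered along the batch axis with `beam_idx`, so every beam continues from the hypothesis it was reassigned to:

```python
import torch

past_state = torch.arange(4.0).view(4, 1, 1)  # stand-in for one cached key/value tensor, batch-first
beam_idx = torch.tensor([2, 2, 0, 1])         # beams 0 and 1 both continue from hypothesis 2

reordered = past_state.index_select(0, beam_idx)
print(reordered.flatten())                    # tensor([2., 2., 0., 1.])
```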
diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py
index ff7d2c547dc44c..e24f2fd4d1abd0 100644
--- a/src/transformers/models/fuyu/processing_fuyu.py
+++ b/src/transformers/models/fuyu/processing_fuyu.py
@@ -264,10 +264,10 @@ def _tokenize_prompts_with_image_and_batch(
bos_token = tokenizer.vocab["|ENDOFTEXT|"]
prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]
if add_beginning_of_answer_token:
- boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
+ beginning_of_answer = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
# Only add the beginning-of-answer token to the last subsequence since that is what will be completed
for token_seq in prompts_tokens:
- token_seq[-1].append(boa)
+ token_seq[-1].append(beginning_of_answer)
# Now we have a list of list of tokens which each list has a different
# size. We want to extend this list to:
@@ -682,6 +682,32 @@ def tokens_to_points(tokens, original_size):
return results
+ def post_process_image_text_to_text(self, generated_outputs):
+ """
+ Post-processes the output of `FuyuForConditionalGeneration` to only return the text output.
+
+ Args:
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
+ The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+ containing the token ids of the generated sequences.
+
+ Returns:
+ `List[str]`: The decoded text output.
+ """
+ beginning_of_answer = self.tokenizer.convert_tokens_to_ids(BEGINNING_OF_ANSWER_STRING)
+ # find the beginning-of-answer token index in each output sequence
+ # keep only what follows it, then pad the sequences back to a consistent length
+ unpadded_output_sequences = [
+ seq[(seq == beginning_of_answer).nonzero(as_tuple=True)[0] + 1 :] for seq in generated_outputs
+ ]
+ max_len = max(len(seq) for seq in unpadded_output_sequences)
+ # convert to torch and pad sequences
+ padded_output_sequences = torch.full((len(unpadded_output_sequences), max_len), self.pad_token_id)
+ for i, seq in enumerate(unpadded_output_sequences):
+ padded_output_sequences[i, : len(seq)] = torch.tensor(seq)
+
+ return self.batch_decode(padded_output_sequences, skip_special_tokens=True)
+
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
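The answer-trimming in `post_process_image_text_to_text` can be illustrated with toy token ids (the ids and pad value below are made up):

```python
import torch

beginning_of_answer, pad_id = 99, 0
sequences = [
    torch.tensor([5, 6, beginning_of_answer, 11, 12, 13]),
    torch.tensor([7, beginning_of_answer, 21, 22]),
]

# keep only what follows the beginning-of-answer token
answers = [seq[(seq == beginning_of_answer).nonzero(as_tuple=True)[0] + 1 :] for seq in sequences]

# re-pad to a rectangular batch so it can be decoded in one call
max_len = max(len(a) for a in answers)
padded = torch.full((len(answers), max_len), pad_id)
for i, a in enumerate(answers):
    padded[i, : len(a)] = a
print(padded)  # tensor([[11, 12, 13], [21, 22,  0]])
```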
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 9a4de1022c57e9..52d02995016167 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -23,7 +23,6 @@
from typing import List, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
@@ -49,7 +48,10 @@
from .configuration_gemma import GemmaConfig
+logger = logging.get_logger(__name__)
+
_CHECKPOINT_FOR_DOC = "google/gemma-7b"
+_CONFIG_FOR_DOC = "GemmaConfig"
class GemmaRMSNorm(nn.Module):
@@ -72,9 +74,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
-logger = logging.get_logger(__name__)
-
-
class GemmaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
@@ -624,9 +623,6 @@ def _init_weights(self, module):
module.weight.data[module.padding_idx].zero_()
-_CONFIG_FOR_DOC = "GemmaConfig"
-
-
GEMMA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -724,7 +720,10 @@ def __init__(self, config: GemmaConfig):
[GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+ logger.warning_once("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -809,7 +808,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -986,6 +985,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
+ _tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)
diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py
index 807f91ff9e6baa..ad1348ae5e3163 100644
--- a/src/transformers/models/gemma/modular_gemma.py
+++ b/src/transformers/models/gemma/modular_gemma.py
@@ -886,7 +886,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py
index ff0d1d034c2238..7138cafbd625fc 100644
--- a/src/transformers/models/gemma/tokenization_gemma.py
+++ b/src/transformers/models/gemma/tokenization_gemma.py
@@ -138,7 +138,7 @@ def __getstate__(self):
return state
def __setstate__(self, d):
- self.__dict__ = d
+ self.__dict__.update(d)
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py
index 74976bdd340f41..eb562b3a6893bd 100644
--- a/src/transformers/models/gemma2/configuration_gemma2.py
+++ b/src/transformers/models/gemma2/configuration_gemma2.py
@@ -19,8 +19,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-
from ...configuration_utils import PretrainedConfig
@@ -36,15 +34,15 @@ class Gemma2Config(PretrainedConfig):
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Gemma2Model`]
- hidden_size (`int`, *optional*, defaults to 3072):
+ hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
- intermediate_size (`int`, *optional*, defaults to 24576):
+ intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
- num_hidden_layers (`int`, *optional*, defaults to 28):
+ num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
- num_attention_heads (`int`, *optional*, defaults to 16):
+ num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
- num_key_value_heads (`int`, *optional*, defaults to 16):
+ num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -80,7 +78,7 @@ class Gemma2Config(PretrainedConfig):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+ query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
size of the sliding window.
final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -103,11 +101,11 @@ class Gemma2Config(PretrainedConfig):
def __init__(
self,
vocab_size=256000,
- hidden_size=3072,
- intermediate_size=24576,
- num_hidden_layers=28,
- num_attention_heads=16,
- num_key_value_heads=16,
+ hidden_size=2304,
+ intermediate_size=9216,
+ num_hidden_layers=26,
+ num_attention_heads=8,
+ num_key_value_heads=4,
head_dim=256,
hidden_activation="gelu_pytorch_tanh",
max_position_embeddings=8192,
@@ -121,7 +119,7 @@ def __init__(
rope_theta=10000.0,
attention_bias=False,
attention_dropout=0.0,
- query_pre_attn_scalar=224,
+ query_pre_attn_scalar=256,
sliding_window=4096,
final_logit_softcapping=30.0,
attn_logit_softcapping=50.0,
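A quick numeric note on the `query_pre_attn_scalar` default change: the attention module computes `scaling = query_pre_attn_scalar ** -0.5`, so the new default of 256 (equal to `head_dim`) gives the familiar 1/sqrt(head_dim) scaling, whereas the old 224 did not:

```python
head_dim = 256
query_pre_attn_scalar = 256          # new default
print(query_pre_attn_scalar ** -0.5) # 0.0625
print(head_dim ** -0.5)              # 0.0625, i.e. 1/sqrt(head_dim)
print(round(224 ** -0.5, 4))         # 0.0668, the previous default's scaling
```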
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index 6d61c47619f304..6111261830b8f0 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -23,7 +23,6 @@
import torch
import torch.nn as nn
-import torch.utils.checkpoint
from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache
@@ -40,15 +39,26 @@
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
+ is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
- is_flash_attn_greater_or_equal_2_10,
+ is_torch_greater_or_equal,
logging,
replace_return_docstrings,
)
from .configuration_gemma2 import Gemma2Config
+if is_flash_attn_2_available():
+ from ...modeling_flash_attention_utils import _flash_attention_forward
+
+if is_torch_greater_or_equal("2.5"):
+ from torch.nn.attention.flex_attention import flex_attention
+
+logger = logging.get_logger(__name__)
+
+
_CHECKPOINT_FOR_DOC = "google/gemma2-7b"
+_CONFIG_FOR_DOC = "Gemma2Config"
class Gemma2RMSNorm(nn.Module):
@@ -86,9 +96,6 @@ def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-logger = logging.get_logger(__name__)
-
-
class Gemma2RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
@@ -163,6 +170,127 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(config, query, key, value, mask, **_kwargs):
+ key_states = repeat_kv(key, config.num_key_value_groups)
+ value_states = repeat_kv(value, config.num_key_value_groups)
+
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
+
+ if config.attn_logit_softcapping is not None:
+ attn_weights = attn_weights / config.attn_logit_softcapping
+ attn_weights = torch.tanh(attn_weights)
+ attn_weights = attn_weights * config.attn_logit_softcapping
+ if mask is not None: # no matter the length, we just slice it
+ causal_mask = mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ return attn_output, attn_weights
+
+
+def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
+ if mask is not None:
+ seq_len = mask.shape[1]
+ key = key[:, :, :seq_len]
+ value = value[:, :, :seq_len]
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
+ query_states = query.transpose(1, 2)
+ key_states = key.transpose(1, 2)
+ value_states = value.transpose(1, 2)
+
+ dropout_rate = config.attention_dropout if config.training else 0.0
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ mask,
+ seq_len,
+ dropout=dropout_rate,
+ softmax_scale=config.scaling,
+ is_causal=config.is_causal,
+ sliding_window=config.sliding_window,
+ use_top_left_mask=config._flash_attn_uses_top_left_mask,
+ softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
+ )
+
+ return attn_output, None
+
+
+def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
+ def tanh_softcap(score, b, h, q_idx, kv_idx):
+ soft_cap = config.attn_logit_softcapping
+ score = soft_cap * torch.tanh(score / soft_cap)
+ if mask is not None:
+ return score + mask[b][0][q_idx][kv_idx]
+ return score
+
+ attn_output = flex_attention(
+ query,
+ key,
+ value,
+ score_mod=tanh_softcap,
+ enable_gqa=True,
+ scale=config.scaling,
+ return_lse=output_attentions,
+ )
+ if not output_attentions:
+ return attn_output, None
+ else:
+ return attn_output[0], attn_output[1]
+
+
+def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
+ key = repeat_kv(key, config.num_key_value_groups)
+ value = repeat_kv(value, config.num_key_value_groups)
+
+ causal_mask = mask
+ if mask is not None:
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query.device.type == "cuda" and causal_mask is not None:
+ query = query.contiguous()
+ key = key.contiguous()
+ value = value.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ is_causal = True if causal_mask is None and query.shape[1] > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query,
+ key,
+ value,
+ attn_mask=causal_mask,
+ dropout_p=config.attention_dropout if config.training else 0.0,
+ is_causal=is_causal,
+ scale=config.scaling,
+ )
+ return attn_output, None
+
+
+GEMMA2_ATTENTION_FUNCTION = {
+ "flash_attention_2": flash_attention_forward,
+ "flex_attention": flex_attention_forward,
+ "eager": eager_attention_forward,
+ "sdpa": sdpa_attention_forward,
+}
+
+
class Gemma2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -170,12 +298,6 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
- if layer_idx is None:
- logger.warning_once(
- f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
- "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
- "when creating this class."
- )
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
@@ -187,7 +309,8 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
self.rope_theta = config.rope_theta
self.is_causal = True
self.scaling = config.query_pre_attn_scalar**-0.5
-
+ self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
+ self.attn_logit_softcapping = config.attn_logit_softcapping
if self.hidden_size % self.num_heads != 0:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
@@ -198,7 +321,6 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
- self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
self.rotary_emb = Gemma2RotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
@@ -238,33 +360,17 @@ def forward(
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
-
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
-
- if self.config.attn_logit_softcapping is not None:
- attn_weights = attn_weights / self.config.attn_logit_softcapping
- attn_weights = torch.tanh(attn_weights)
- attn_weights = attn_weights * self.config.attn_logit_softcapping
- if attention_mask is not None: # no matter the length, we just slice it
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
- attn_weights = attn_weights + causal_mask
-
- # upcast attention to fp32
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
- attn_output = torch.matmul(attn_weights, value_states)
-
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
- raise ValueError(
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
- f" {attn_output.size()}"
- )
+ if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
+ logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
+ attention_type = "eager"
+ else:
+ attention_type = self.config._attn_implementation
- attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
+ self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
+ )
- attn_output = attn_output.view(bsz, q_len, -1)
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
@@ -274,233 +380,39 @@ def forward(
class Gemma2FlashAttention2(Gemma2Attention):
- """
- Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays
- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
- flash attention and deal with padding tokens in case the input contains any of them.
- """
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_value: Optional[Cache] = None,
- output_attentions: bool = False,
- use_cache: bool = False,
- cache_position: Optional[torch.LongTensor] = None,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- output_attentions = False
-
- bsz, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- # Flash attention requires the input to have the shape
- # batch_size x seq_length x head_dim x hidden_dim
- # therefore we just need to keep the original shape
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
- cos, sin = self.rotary_emb(value_states, position_ids)
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
- if past_key_value is not None:
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
- cache_kwargs = {
- "sin": sin,
- "cos": cos,
- "sliding_window": self.sliding_window,
- "cache_position": cache_position,
- }
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
- if attention_mask is not None:
- seq_len = attention_mask.shape[1]
- key_states = key_states[:, :, :seq_len]
- value_states = value_states[:, :, :seq_len]
-
- # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- # to be able to avoid many of these transpose/reshape/view.
- query_states = query_states.transpose(1, 2)
- key_states = key_states.transpose(1, 2)
- value_states = value_states.transpose(1, 2)
-
- dropout_rate = self.attention_dropout if self.training else 0.0
-
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
- # therefore the input hidden states gets silently casted in float32. Hence, we need
- # cast them back in the correct dtype just to be sure everything works as expected.
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms
- # in fp32. (Gemma2RMSNorm handles it correctly)
-
- input_dtype = query_states.dtype
- if input_dtype == torch.float32:
- if torch.is_autocast_enabled():
- target_dtype = torch.get_autocast_gpu_dtype()
- # Handle the case where the model is quantized
- elif hasattr(self.config, "_pre_quantization_dtype"):
- target_dtype = self.config._pre_quantization_dtype
- else:
- target_dtype = self.q_proj.weight.dtype
-
- logger.warning_once(
- f"The input hidden states seems to be silently casted in float32, this might be related to"
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
- f" {target_dtype}."
- )
-
- query_states = query_states.to(target_dtype)
- key_states = key_states.to(target_dtype)
- value_states = value_states.to(target_dtype)
-
- attn_output = _flash_attention_forward(
- query_states,
- key_states,
- value_states,
- attention_mask,
- q_len,
- dropout=dropout_rate,
- softmax_scale=self.scaling,
- is_causal=self.is_causal,
- sliding_window=self.sliding_window,
- use_top_left_mask=self._flash_attn_uses_top_left_mask,
- softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+ super().__init__(config, layer_idx)
+ self.config._attn_implementation = "flash_attention_2"
+ logger.warning_once(
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
- attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
- attn_output = self.o_proj(attn_output)
-
- if not output_attentions:
- attn_weights = None
-
- return attn_output, attn_weights, past_key_value
-
class Gemma2SdpaAttention(Gemma2Attention):
- """
- Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
- `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
- SDPA API.
- """
-
- # Adapted from Gemma2Attention.forward
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_value: Optional[Cache] = None,
- output_attentions: bool = False,
- use_cache: bool = False,
- cache_position: Optional[torch.LongTensor] = None,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- if output_attentions:
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- logger.warning_once(
- "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
- 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
- )
- return super().forward(
- hidden_states=hidden_states,
- attention_mask=attention_mask,
- position_ids=position_ids,
- past_key_value=past_key_value,
- output_attentions=output_attentions,
- use_cache=use_cache,
- cache_position=cache_position,
- )
-
- bsz, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
- cos, sin = self.rotary_emb(value_states, position_ids)
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
- if past_key_value is not None:
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
- cache_kwargs = {
- "sin": sin,
- "cos": cos,
- "sliding_window": self.sliding_window,
- "cache_position": cache_position,
- }
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
-
- causal_mask = attention_mask
- if attention_mask is not None:
- causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
-
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
- # Reference: https://github.com/pytorch/pytorch/issues/112577.
- if query_states.device.type == "cuda" and causal_mask is not None:
- query_states = query_states.contiguous()
- key_states = key_states.contiguous()
- value_states = value_states.contiguous()
-
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
- # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
- is_causal = True if causal_mask is None and q_len > 1 else False
-
- attn_output = torch.nn.functional.scaled_dot_product_attention(
- query_states,
- key_states,
- value_states,
- attn_mask=causal_mask,
- dropout_p=self.attention_dropout if self.training else 0.0,
- is_causal=is_causal,
- scale=self.scaling,
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+ super().__init__(config, layer_idx)
+ self.config._attn_implementation = "sdpa"
+ logger.warning_once(
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
- attn_output = attn_output.transpose(1, 2).contiguous()
- attn_output = attn_output.view(bsz, q_len, -1)
-
- attn_output = self.o_proj(attn_output)
-
- return attn_output, None, past_key_value
-
-
-GEMMA2_ATTENTION_CLASSES = {
- "eager": Gemma2Attention,
- "flash_attention_2": Gemma2FlashAttention2,
- "sdpa": Gemma2SdpaAttention,
-}
-
class Gemma2DecoderLayer(nn.Module):
def __init__(self, config: Gemma2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
- self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
- self.mlp = Gemma2MLP(config)
- self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.config = config
self.is_sliding = not bool(layer_idx % 2)
+ self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
+ self.mlp = Gemma2MLP(config)
+ self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
- self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
@@ -512,25 +424,6 @@ def forward(
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
- """
- Args:
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
- attention_mask (`torch.FloatTensor`, *optional*):
- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
- query_sequence_length, key_sequence_length)` if default attention is used.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
- (see `past_key_values`).
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
- cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
- Indices depicting the position of the input sequence tokens in the sequence
- kwargs (`dict`, *optional*):
- Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
- into the model
- """
if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
# Flash-attn is a 2D tensor
if self.config._attn_implementation == "flash_attention_2":
@@ -638,9 +531,6 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
return config
-_CONFIG_FOR_DOC = "Gemma2Config"
-
-
GEMMA2_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -738,7 +628,10 @@ def __init__(self, config: Gemma2Config):
[Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+ logger.warning_once("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -818,7 +711,7 @@ def forward(
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -865,6 +758,7 @@ def forward(
attentions=all_self_attns,
)
+ @torch.no_grad()
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
@@ -958,6 +852,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
+ _tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)
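The refactor above replaces the per-implementation attention subclasses with plain functions selected from `GEMMA2_ATTENTION_FUNCTION`. A stripped-down sketch of that dispatch pattern (generic tensors, no Gemma2 specifics; masks and softcapping are omitted, and the `scale` argument of SDPA needs a reasonably recent PyTorch):

```python
import torch


def eager_attn(query, key, value, scale, **_kwargs):
    weights = torch.softmax(query @ key.transpose(-2, -1) * scale, dim=-1)
    return weights @ value


def sdpa_attn(query, key, value, scale, **_kwargs):
    return torch.nn.functional.scaled_dot_product_attention(query, key, value, scale=scale)


ATTENTION_FUNCTION = {"eager": eager_attn, "sdpa": sdpa_attn}

q = torch.randn(1, 8, 16, 64)  # (batch, heads, seq, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)
scale = 64 ** -0.5

# the module picks the kernel by name, much like config._attn_implementation does above
out = ATTENTION_FUNCTION["sdpa"](q, k, v, scale)
ref = ATTENTION_FUNCTION["eager"](q, k, v, scale)
torch.testing.assert_close(out, ref, atol=1e-4, rtol=1e-4)
```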
diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index 7ddb1c9f4c99e7..8d86238632365f 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -29,18 +29,17 @@
from ...utils import (
is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
- is_flash_attn_greater_or_equal_2_10,
+ is_torch_greater_or_equal,
logging,
)
from ..gemma.modeling_gemma import (
- GemmaAttention,
- GemmaDecoderLayer,
GemmaForCausalLM,
GemmaForSequenceClassification,
GemmaForTokenClassification,
GemmaModel,
GemmaPreTrainedModel,
GemmaRMSNorm,
+ GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
@@ -49,6 +48,9 @@
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
+if is_torch_greater_or_equal("2.5"):
+ from torch.nn.attention.flex_attention import flex_attention
+
_CHECKPOINT_FOR_DOC = "google/gemma2-7b"
@@ -67,15 +69,15 @@ class Gemma2Config(PretrainedConfig):
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Gemma2Model`]
- hidden_size (`int`, *optional*, defaults to 3072):
+ hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
- intermediate_size (`int`, *optional*, defaults to 24576):
+ intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
- num_hidden_layers (`int`, *optional*, defaults to 28):
+ num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
- num_attention_heads (`int`, *optional*, defaults to 16):
+ num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
- num_key_value_heads (`int`, *optional*, defaults to 16):
+ num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -111,7 +113,7 @@ class Gemma2Config(PretrainedConfig):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+ query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
size of the sliding window.
final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -134,11 +136,11 @@ class Gemma2Config(PretrainedConfig):
def __init__(
self,
vocab_size=256000,
- hidden_size=3072,
- intermediate_size=24576,
- num_hidden_layers=28,
- num_attention_heads=16,
- num_key_value_heads=16,
+ hidden_size=2304,
+ intermediate_size=9216,
+ num_hidden_layers=26,
+ num_attention_heads=8,
+ num_key_value_heads=4,
head_dim=256,
hidden_activation="gelu_pytorch_tanh",
max_position_embeddings=8192,
@@ -152,7 +154,7 @@ def __init__(
rope_theta=10000.0,
attention_bias=False,
attention_dropout=0.0,
- query_pre_attn_scalar=224,
+ query_pre_attn_scalar=256,
sliding_window=4096,
final_logit_softcapping=30.0,
attn_logit_softcapping=50.0,
@@ -207,118 +209,183 @@ def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-class Gemma2Attention(GemmaAttention):
+class Gemma2RotaryEmbedding(GemmaRotaryEmbedding):
+ pass
+
+
+def eager_attention_forward(config, query, key, value, mask, **_kwargs):
+ key_states = repeat_kv(key, config.num_key_value_groups)
+ value_states = repeat_kv(value, config.num_key_value_groups)
+
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
+
+ if config.attn_logit_softcapping is not None:
+ attn_weights = attn_weights / config.attn_logit_softcapping
+ attn_weights = torch.tanh(attn_weights)
+ attn_weights = attn_weights * config.attn_logit_softcapping
+ if mask is not None: # no matter the length, we just slice it
+ causal_mask = mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ return attn_output, attn_weights
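
Reviewer note: the soft-capping block in `eager_attention_forward` bounds the attention logits to the open interval (-`attn_logit_softcapping`, +`attn_logit_softcapping`) via `cap * tanh(x / cap)` before the mask and softmax are applied. A tiny numeric sketch of that transform with toy values (not Gemma2 activations):

```python
# Toy demonstration of tanh logit soft-capping: large logits are squashed to
# within +/-cap, while small logits are left almost unchanged.
import torch

cap = 50.0  # matches the default attn_logit_softcapping in this config
logits = torch.tensor([-500.0, -5.0, 0.0, 5.0, 500.0])
capped = cap * torch.tanh(logits / cap)
print(capped)  # ~ tensor([-50.0000, -4.9834, 0.0000, 4.9834, 50.0000]), every value bounded by +/-cap
```
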
+
+
+def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
+    # Default to the query length when no attention mask is provided
+    seq_len = query.shape[2]
+    if mask is not None:
+        seq_len = mask.shape[1]
+        query = query[:, :, :seq_len]
+        value = value[:, :, :seq_len]
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
+ query_states = query.transpose(1, 2)
+ key_states = key.transpose(1, 2)
+ value_states = value.transpose(1, 2)
+
+ dropout_rate = config.attention_dropout if config.training else 0.0
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ mask,
+ seq_len,
+ dropout=dropout_rate,
+ softmax_scale=config.scaling,
+ is_causal=config.is_causal,
+ sliding_window=config.sliding_window,
+ use_top_left_mask=config._flash_attn_uses_top_left_mask,
+ softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
+ )
+
+ return attn_output, None
+
+
+def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
+ def tanh_softcap(score, b, h, q_idx, kv_idx):
+ soft_cap = config.attn_logit_softcapping
+ score = soft_cap * torch.tanh(score / soft_cap)
+ if mask is not None:
+ return score + mask[b][0][q_idx][kv_idx]
+ return score
+
+ attn_output = flex_attention(
+ query,
+ key,
+ value,
+ score_mod=tanh_softcap,
+ enable_gqa=True,
+ scale=config.scaling,
+ return_lse=output_attentions,
+ )
+ if not output_attentions:
+ return attn_output, None
+ else:
+ return attn_output[0], attn_output[1]
+
+
+def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
+ key = repeat_kv(key, config.num_key_value_groups)
+ value = repeat_kv(value, config.num_key_value_groups)
+
+ causal_mask = mask
+ if mask is not None:
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query.device.type == "cuda" and causal_mask is not None:
+ query = query.contiguous()
+ key = key.contiguous()
+ value = value.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ is_causal = True if causal_mask is None and query.shape[1] > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query,
+ key,
+ value,
+ attn_mask=causal_mask,
+ dropout_p=config.attention_dropout if config.training else 0.0,
+ is_causal=is_causal,
+ scale=config.scaling,
+ )
+ return attn_output, None
+
+
+GEMMA2_ATTENTION_FUNCTION = {
+ "flash_attention_2": flash_attention_forward,
+ "flex_attention": flex_attention_forward,
+ "eager": eager_attention_forward,
+ "sdpa": sdpa_attention_forward,
+}
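
Reviewer note: the `GEMMA2_ATTENTION_FUNCTION` table above replaces the per-backend attention subclasses with plain functions selected by name at call time. A minimal, self-contained sketch of the same dispatch pattern, using stand-in callables rather than the real eager/SDPA/flash/flex implementations:

```python
# Minimal sketch of name-keyed attention dispatch; the callables are toy stand-ins,
# not the library's attention functions.
from typing import Callable, Dict


def eager_stub(config, query, key, value, mask, **kwargs):
    return "eager-output", "eager-weights"


def sdpa_stub(config, query, key, value, mask, **kwargs):
    return "sdpa-output", None  # SDPA cannot return attention weights


ATTENTION_FUNCTIONS: Dict[str, Callable] = {"eager": eager_stub, "sdpa": sdpa_stub}


class ToyConfig:
    _attn_implementation = "sdpa"


def run_attention(config, query, key, value, mask, output_attentions=False):
    name = config._attn_implementation
    # Backends that cannot expose attention weights fall back to eager.
    if output_attentions and name in ("sdpa", "flash_attention_2"):
        name = "eager"
    return ATTENTION_FUNCTIONS[name](config, query, key, value, mask)


print(run_attention(ToyConfig(), None, None, None, None))                          # ('sdpa-output', None)
print(run_attention(ToyConfig(), None, None, None, None, output_attentions=True))  # ('eager-output', 'eager-weights')
```
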
+
+
+class Gemma2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
- super().__init__(config, layer_idx)
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = config.head_dim
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.is_causal = True
self.scaling = config.query_pre_attn_scalar**-0.5
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_value: Optional[Cache] = None,
- output_attentions: bool = False,
- use_cache: bool = False,
- cache_position: Optional[torch.LongTensor] = None,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- bsz, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
- cos, sin = self.rotary_emb(value_states, position_ids)
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
- if past_key_value is not None:
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
- cache_kwargs = {
- "sin": sin,
- "cos": cos,
- "sliding_window": self.sliding_window,
- "cache_position": cache_position,
- }
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
-
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
-
- if self.config.attn_logit_softcapping is not None:
- attn_weights = attn_weights / self.config.attn_logit_softcapping
- attn_weights = torch.tanh(attn_weights)
- attn_weights = attn_weights * self.config.attn_logit_softcapping
- if attention_mask is not None: # no matter the length, we just slice it
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
- attn_weights = attn_weights + causal_mask
-
- # upcast attention to fp32
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
- attn_output = torch.matmul(attn_weights, value_states)
-
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ self.attn_logit_softcapping = config.attn_logit_softcapping
+ if self.hidden_size % self.num_heads != 0:
raise ValueError(
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
- f" {attn_output.size()}"
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
)
- attn_output = attn_output.transpose(1, 2).contiguous()
-
- attn_output = attn_output.view(bsz, q_len, -1)
- attn_output = self.o_proj(attn_output)
-
- if not output_attentions:
- attn_weights = None
-
- return attn_output, attn_weights, past_key_value
-
-
-class Gemma2FlashAttention2(Gemma2Attention):
- """
- Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays
- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
- flash attention and deal with padding tokens in case the input contains any of them.
- """
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+ self.rotary_emb = Gemma2RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
def forward(
self,
hidden_states: torch.Tensor,
- attention_mask: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- output_attentions = False
-
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
- # Flash attention requires the input to have the shape
- # batch_size x seq_length x head_dim x hidden_dim
- # therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -336,57 +403,14 @@ def forward(
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
- if attention_mask is not None:
- seq_len = attention_mask.shape[1]
- key_states = key_states[:, :, :seq_len]
- value_states = value_states[:, :, :seq_len]
-
- # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- # to be able to avoid many of these transpose/reshape/view.
- query_states = query_states.transpose(1, 2)
- key_states = key_states.transpose(1, 2)
- value_states = value_states.transpose(1, 2)
-
- dropout_rate = self.attention_dropout if self.training else 0.0
-
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
- # therefore the input hidden states gets silently casted in float32. Hence, we need
- # cast them back in the correct dtype just to be sure everything works as expected.
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms
- # in fp32. (Gemma2RMSNorm handles it correctly)
-
- input_dtype = query_states.dtype
- if input_dtype == torch.float32:
- if torch.is_autocast_enabled():
- target_dtype = torch.get_autocast_gpu_dtype()
- # Handle the case where the model is quantized
- elif hasattr(self.config, "_pre_quantization_dtype"):
- target_dtype = self.config._pre_quantization_dtype
- else:
- target_dtype = self.q_proj.weight.dtype
-
- logger.warning_once(
- f"The input hidden states seems to be silently casted in float32, this might be related to"
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
- f" {target_dtype}."
- )
-
- query_states = query_states.to(target_dtype)
- key_states = key_states.to(target_dtype)
- value_states = value_states.to(target_dtype)
+ if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
+ logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
+ attention_type = "eager"
+ else:
+ attention_type = self.config._attn_implementation
- attn_output = _flash_attention_forward(
- query_states,
- key_states,
- value_states,
- attention_mask,
- q_len,
- dropout=dropout_rate,
- softmax_scale=self.scaling,
- is_causal=self.is_causal,
- sliding_window=self.sliding_window,
- use_top_left_mask=self._flash_attn_uses_top_left_mask,
- softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
+ attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
+ self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@@ -398,105 +422,37 @@ def forward(
return attn_output, attn_weights, past_key_value
-class Gemma2SdpaAttention(Gemma2Attention):
- """
- Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
- `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
- SDPA API.
- """
-
- # Adapted from Gemma2Attention.forward
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_value: Optional[Cache] = None,
- output_attentions: bool = False,
- use_cache: bool = False,
- cache_position: Optional[torch.LongTensor] = None,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- if output_attentions:
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- logger.warning_once(
- "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
- 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
- )
- return super().forward(
- hidden_states=hidden_states,
- attention_mask=attention_mask,
- position_ids=position_ids,
- past_key_value=past_key_value,
- output_attentions=output_attentions,
- use_cache=use_cache,
- cache_position=cache_position,
- )
-
- bsz, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
- cos, sin = self.rotary_emb(value_states, position_ids)
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
- if past_key_value is not None:
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
- cache_kwargs = {
- "sin": sin,
- "cos": cos,
- "sliding_window": self.sliding_window,
- "cache_position": cache_position,
- }
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
-
- causal_mask = attention_mask
- if attention_mask is not None:
- causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
-
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
- # Reference: https://github.com/pytorch/pytorch/issues/112577.
- if query_states.device.type == "cuda" and causal_mask is not None:
- query_states = query_states.contiguous()
- key_states = key_states.contiguous()
- value_states = value_states.contiguous()
-
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
- # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
- is_causal = True if causal_mask is None and q_len > 1 else False
-
- attn_output = torch.nn.functional.scaled_dot_product_attention(
- query_states,
- key_states,
- value_states,
- attn_mask=causal_mask,
- dropout_p=self.attention_dropout if self.training else 0.0,
- is_causal=is_causal,
- scale=self.scaling,
+class Gemma2FlashAttention2(Gemma2Attention):
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+ super().__init__(config, layer_idx)
+ self.config._attn_implementation = "flash_attention_2"
+ logger.warning_once(
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
- attn_output = attn_output.transpose(1, 2).contiguous()
- attn_output = attn_output.view(bsz, q_len, -1)
-
- attn_output = self.o_proj(attn_output)
- return attn_output, None, past_key_value
+class Gemma2SdpaAttention(Gemma2Attention):
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+ super().__init__(config, layer_idx)
+ self.config._attn_implementation = "sdpa"
+ logger.warning_once(
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
+ )
-class Gemma2DecoderLayer(GemmaDecoderLayer):
+class Gemma2DecoderLayer(nn.Module):
def __init__(self, config: Gemma2Config, layer_idx: int):
- super().__init__(config, layer_idx)
+ super().__init__()
+ self.hidden_size = config.hidden_size
self.config = config
self.is_sliding = not bool(layer_idx % 2)
+ self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
self.mlp = Gemma2MLP(config)
+ self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
@@ -653,7 +609,7 @@ def forward(
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
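
Reviewer note: in the `Gemma2DecoderLayer` changes above, `is_sliding = not bool(layer_idx % 2)` means even-indexed layers use sliding-window attention and odd-indexed layers attend globally. A quick sketch of that alternation with a toy layer count (the real default config uses 26 layers):

```python
# Toy illustration of Gemma2's layer alternation: even layer indices get a
# sliding window, odd ones use full (global) attention.
num_hidden_layers = 6   # toy value for readability
sliding_window = 4096   # default sliding_window in this config

for layer_idx in range(num_hidden_layers):
    is_sliding = not bool(layer_idx % 2)
    window = sliding_window if is_sliding else None
    print(f"layer {layer_idx}: sliding_window={window}")
# layer 0: sliding_window=4096
# layer 1: sliding_window=None
# layer 2: sliding_window=4096
# ...
```
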
diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py
index ecaea17ff946af..1be3e7067bdfcf 100644
--- a/src/transformers/models/git/configuration_git.py
+++ b/src/transformers/models/git/configuration_git.py
@@ -13,8 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
@@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig):
```"""
model_type = "git_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -102,24 +101,6 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from GITConfig
- if config_dict.get("model_type") == "git":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class GitConfig(PretrainedConfig):
r"""
@@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig):
```"""
model_type = "git"
+ sub_configs = {"vision_config": GitVisionConfig}
def __init__(
self,
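
Reviewer note: the removed `from_pretrained` override is replaced by the declarative `base_config_key` / `sub_configs` attributes, which let the shared loading machinery pull a nested config out of a composite checkpoint. A rough sketch of the idea with simplified dictionaries (not the actual transformers loading code):

```python
# Simplified sketch of what declaring `base_config_key = "vision_config"` buys:
# when the loaded dict belongs to the composite model, the shared loader can
# slice out the nested sub-config instead of each class overriding from_pretrained.
composite_config_dict = {
    "model_type": "git",
    "vision_config": {"model_type": "git_vision_model", "hidden_size": 768},
}


def extract_sub_config(config_dict: dict, base_config_key: str, expected_model_type: str) -> dict:
    # If this is the composite config, descend into the declared sub-key.
    if config_dict.get("model_type") != expected_model_type and base_config_key in config_dict:
        config_dict = config_dict[base_config_key]
    return config_dict


print(extract_sub_config(composite_config_dict, "vision_config", "git_vision_model"))
# {'model_type': 'git_vision_model', 'hidden_size': 768}
```
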
diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py
index 3744d81a0aca81..e9e96fa765d841 100644
--- a/src/transformers/models/git/processing_git.py
+++ b/src/transformers/models/git/processing_git.py
@@ -22,12 +22,16 @@
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
class GitProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {}
+logger = logging.get_logger(__name__)
+
+
class GitProcessor(ProcessorMixin):
r"""
Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor.
@@ -91,6 +95,15 @@ def __call__(
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
+ legacy = kwargs.pop("legacy", True)
+ if legacy:
+ logger.warning_once(
+ "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. "
+ "In the new behavior, if both images and text are provided, the last token (EOS token) "
+ "of the input_ids and attention_mask tensors will be removed. "
+ "To test the new behavior, set `legacy=False`as a processor call argument."
+ )
+
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
@@ -110,6 +123,10 @@ def __call__(
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
data.update(image_features)
+ if not legacy:
+ data["input_ids"] = data["input_ids"][:, :-1]
+ data["attention_mask"] = data["attention_mask"][:, :-1]
+
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
def batch_decode(self, *args, **kwargs):
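
Reviewer note: with `legacy=False`, the processor drops the trailing EOS position from both `input_ids` and `attention_mask` after tokenization. The trimming itself is ordinary tensor slicing; a small sketch with made-up ids (not real GIT tokenizer output):

```python
# Toy illustration of the legacy=False path: strip the last (EOS) position
# from input_ids and attention_mask. The token ids here are invented.
import torch

data = {
    "input_ids": torch.tensor([[101, 2023, 2003, 102]]),   # 102 standing in for the EOS/SEP id
    "attention_mask": torch.tensor([[1, 1, 1, 1]]),
}
data["input_ids"] = data["input_ids"][:, :-1]
data["attention_mask"] = data["attention_mask"][:, :-1]
print(data["input_ids"])       # tensor([[ 101, 2023, 2003]])
print(data["attention_mask"])  # tensor([[1, 1, 1]])
```
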
diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py
index 5f8eaf89ed9353..9080b5b9cc7c39 100644
--- a/src/transformers/models/glm/modeling_glm.py
+++ b/src/transformers/models/glm/modeling_glm.py
@@ -24,7 +24,6 @@
import torch
import torch.nn as nn
-import torch.utils.checkpoint
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
@@ -50,7 +49,10 @@
from .configuration_glm import GlmConfig
+logger = logging.get_logger(__name__)
+
_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b"
+_CONFIG_FOR_DOC = "GlmConfig"
class GlmRMSNorm(nn.Module):
@@ -121,7 +123,16 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
return self.down_proj(up_states)
-logger = logging.get_logger(__name__)
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
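
Reviewer note: `repeat_kv` is the grouped-query-attention helper. As its docstring says, it is equivalent to `torch.repeat_interleave(x, repeats=n_rep, dim=1)`: each key/value head is tiled `n_rep` times so the key/value heads line up with the query heads. A quick shape check with random tensors:

```python
# Shape-level check of the repeat_kv expand+reshape trick:
# (batch, kv_heads, seq, dim) -> (batch, kv_heads * n_rep, seq, dim).
import torch

batch, num_kv_heads, seq_len, head_dim, n_rep = 2, 4, 5, 8, 2
kv = torch.randn(batch, num_kv_heads, seq_len, head_dim)

expanded = kv[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
repeated = expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

print(repeated.shape)  # torch.Size([2, 8, 5, 8])
print(torch.equal(repeated, torch.repeat_interleave(kv, repeats=n_rep, dim=1)))  # True
```
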
def rotate_half(x):
@@ -172,18 +183,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
return q_embed, k_embed
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
- """
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
- """
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
- if n_rep == 1:
- return hidden_states
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
class GlmAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -608,9 +607,6 @@ def _init_weights(self, module):
module.weight.data[module.padding_idx].zero_()
-_CONFIG_FOR_DOC = "GlmConfig"
-
-
GLM_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -712,6 +708,8 @@ def __init__(self, config: GlmConfig):
dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
)
self.gradient_checkpointing = False
+ if getattr(config, "pretraining_tp", 1) != 1:
+            logger.warning_once("`pretraining_tp` is deprecated; please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@@ -791,7 +789,7 @@ def forward(
all_self_attns = () if output_attentions else None
next_decoder_cache = None
- for decoder_layer in self.layers:
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@@ -971,6 +969,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
+ _tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config: GlmConfig):
super().__init__(config)
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 9e69c8ae8a6e7a..115cefc86beec3 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -14,7 +14,11 @@
# limitations under the License.
"""Image processor class for GLPN."""
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+
+if TYPE_CHECKING:
+ from ...modeling_outputs import DepthEstimatorOutput
import numpy as np
import PIL.Image
@@ -27,12 +31,17 @@
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
+ is_torch_available,
make_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
-from ...utils import TensorType, filter_out_non_signature_kwargs, logging
+from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends
+
+
+if is_torch_available():
+ import torch
logger = logging.get_logger(__name__)
@@ -218,3 +227,44 @@ def preprocess(
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
+
+ def post_process_depth_estimation(
+ self,
+ outputs: "DepthEstimatorOutput",
+ target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
+ ) -> List[Dict[str, TensorType]]:
+ """
+ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
+ Only supports PyTorch.
+
+ Args:
+ outputs ([`DepthEstimatorOutput`]):
+ Raw outputs of the model.
+ target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+ Returns:
+ `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+ predictions.
+ """
+ requires_backends(self, "torch")
+
+ predicted_depth = outputs.predicted_depth
+
+ if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+ )
+
+ results = []
+ target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
+ for depth, target_size in zip(predicted_depth, target_sizes):
+ if target_size is not None:
+ depth = depth[None, None, ...]
+ depth = torch.nn.functional.interpolate(depth, size=target_size, mode="bicubic", align_corners=False)
+ depth = depth.squeeze()
+
+ results.append({"predicted_depth": depth})
+
+ return results
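
Reviewer note: the core of `post_process_depth_estimation` is a bicubic resize of each predicted depth map to the requested `(height, width)`. A toy version of that resize step on a random tensor, with no model involved:

```python
# Toy version of the per-image resize performed in post_process_depth_estimation:
# add batch/channel dims, bicubic-interpolate to the target size, then squeeze.
import torch

predicted_depth = torch.rand(32, 32)   # stand-in for one model output
target_size = (480, 640)               # (height, width)

resized = torch.nn.functional.interpolate(
    predicted_depth[None, None, ...], size=target_size, mode="bicubic", align_corners=False
).squeeze()

print(resized.shape)  # torch.Size([480, 640])
```
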
diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py
index 9fd22ca0f7be95..70f175df8c9973 100755
--- a/src/transformers/models/glpn/modeling_glpn.py
+++ b/src/transformers/models/glpn/modeling_glpn.py
@@ -723,20 +723,18 @@ def forward(
>>> with torch.no_grad():
... outputs = model(**inputs)
- ... predicted_depth = outputs.predicted_depth
>>> # interpolate to original size
- >>> prediction = torch.nn.functional.interpolate(
- ... predicted_depth.unsqueeze(1),
- ... size=image.size[::-1],
- ... mode="bicubic",
- ... align_corners=False,
+ >>> post_processed_output = image_processor.post_process_depth_estimation(
+ ... outputs,
+ ... target_sizes=[(image.height, image.width)],
... )
>>> # visualize the prediction
- >>> output = prediction.squeeze().cpu().numpy()
- >>> formatted = (output * 255 / np.max(output)).astype("uint8")
- >>> depth = Image.fromarray(formatted)
+ >>> predicted_depth = post_processed_output[0]["predicted_depth"]
+ >>> depth = predicted_depth * 255 / predicted_depth.max()
+ >>> depth = depth.detach().cpu().numpy()
+ >>> depth = Image.fromarray(depth.astype("uint8"))
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py
index e608fbcdbe9c0a..e85e4fc9184371 100644
--- a/src/transformers/models/groupvit/configuration_groupvit.py
+++ b/src/transformers/models/groupvit/configuration_groupvit.py
@@ -14,9 +14,8 @@
# limitations under the License.
"""GroupViT model configuration"""
-import os
from collections import OrderedDict
-from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Mapping, Optional
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
@@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig):
```"""
model_type = "groupvit_text_model"
+ base_config_key = "text_config"
def __init__(
self,
@@ -121,24 +121,6 @@ def __init__(
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from GroupViTConfig
- if config_dict.get("model_type") == "groupvit":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class GroupViTVisionConfig(PretrainedConfig):
r"""
@@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig):
```"""
model_type = "groupvit_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -246,24 +229,6 @@ def __init__(
self.assign_eps = assign_eps
self.assign_mlp_ratio = assign_mlp_ratio
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from GroupViTConfig
- if config_dict.get("model_type") == "groupvit":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class GroupViTConfig(PretrainedConfig):
r"""
@@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig):
"""
model_type = "groupvit"
+ sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py
index 56b6025a8e89dd..e34a5764400196 100644
--- a/src/transformers/models/idefics/configuration_idefics.py
+++ b/src/transformers/models/idefics/configuration_idefics.py
@@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
- hidden_size (`int`, *optional*, defaults to 768):
+ embed_dim (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
@@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- image_num_channels (`int`, *optional*, defaults to `3`):
+ num_channels (`int`, *optional*, defaults to 3):
Number of image channels.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
- layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
@@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig):
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
testing).
- initializer_range (`float`, *optional*, defaults to 0.02):
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
"""
- model_type = "idefics"
+ model_type = "idefics_vision"
attribute_map = {
"hidden_size": "embed_dim",
}
@@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
Args:
use_resampler (`bool`, *optional*, defaults to `False`):
Whether or not to use the resampler
- resampler_n_latents (`int`, *optional*, defaults to ):
+ resampler_n_latents (`int`, *optional*, defaults to 64):
Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
resampler_depth (`int`, *optional*, defaults to 6):
Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
@@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
Whether or not to use qk layer norms in perceiver
"""
- model_type = "idefics"
+ model_type = "idefics_perciever"
def __init__(
self,
@@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig):
```"""
model_type = "idefics"
- is_composition = False
+ sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig}
def __init__(
self,
diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3406ab2226e08b..ca6e4702d3173e 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -219,7 +219,11 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
- self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+ self.image_token_id = (
+ tokenizer.image_token_id
+ if hasattr(tokenizer, "image_token")
+ else tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+ )
self.default_image_dims = (
self.image_processor.image_num_channels,
diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py
index 64743d1cd470e7..408d374c77f7eb 100644
--- a/src/transformers/models/idefics2/configuration_idefics2.py
+++ b/src/transformers/models/idefics2/configuration_idefics2.py
@@ -13,12 +13,9 @@
# limitations under the License.
"""Idefics2 model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig):
>>> configuration = model.config
```"""
- model_type = "idefics2"
+ model_type = "idefics2_vision"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -107,24 +105,6 @@ def __init__(
self.hidden_act = hidden_act
self.initializer_range = initializer_range
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from Idefics2Config
- if config_dict.get("model_type") == "idefics2":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class Idefics2PerceiverConfig(PretrainedConfig):
r"""
@@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
"""
- model_type = "idefics2"
+ model_type = "idefics2_perceiver"
def __init__(
self,
@@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig):
```"""
model_type = "idefics2"
- is_composition = True
+ sub_configs = {
+ "text_config": AutoConfig,
+ "perceiver_config": Idefics2PerceiverConfig,
+ "vision_config": Idefics2VisionConfig,
+ }
def __init__(
self,
diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py
index ac9df68871eee2..ce0032f80c5ece 100644
--- a/src/transformers/models/idefics2/image_processing_idefics2.py
+++ b/src/transformers/models/idefics2/image_processing_idefics2.py
@@ -99,6 +99,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
isinstance(images, (list, tuple))
and len(images) > 0
and isinstance(images[0], (list, tuple))
+ and len(images[0]) > 0
and is_valid_image(images[0][0])
):
pass
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index 68566d182678c2..f99c1bda474568 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -16,6 +16,7 @@
Processor class for IDEFICS2.
"""
+from itertools import accumulate
from typing import TYPE_CHECKING, List, Optional, Union
from ...feature_extraction_utils import BatchFeature
@@ -94,16 +95,19 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, cha
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
- self.fake_image_token = AddedToken("", normalized=False, special=True)
- self.image_token = AddedToken("", normalized=False, special=True)
+ if not hasattr(tokenizer, "image_token"):
+ self.fake_image_token = AddedToken("", normalized=False, special=True)
+ self.image_token = AddedToken("", normalized=False, special=True)
+ tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
+ tokenizer.add_special_tokens(tokens_to_add)
+ else:
+ self.fake_image_token = tokenizer.image_boundary_token
+ self.image_token = tokenizer.image_token
+
self.end_of_utterance_token = AddedToken("", normalized=False, special=True)
+ tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
self.image_seq_len = image_seq_len
- tokens_to_add = {
- "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token]
- }
- tokenizer.add_special_tokens(tokens_to_add)
-
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def _extract_images_from_prompts(self, prompts):
@@ -218,7 +222,21 @@ def __call__(
if is_image_or_image_url(images):
images = [[images]]
elif isinstance(images, list) and is_image_or_image_url(images[0]):
- images = [images]
+ if text is not None:
+ if sum(n_images_in_text) != len(images):
+ raise ValueError(
+ f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed."
+ f" Found {sum(n_images_in_text)} {image_token} tokens and {len(images)} images."
+ )
+ # Reorganize the images to match the prompts
+ cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
+ images = [
+ images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]]
+ for i in range(len(n_images_in_text))
+ ]
+ else:
+ images = [images]
+
elif (
not isinstance(images, list)
and not isinstance(images[0], list)
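
Reviewer note: when a flat list of images is passed together with several prompts, the processor now counts the image tokens per prompt and uses a running total to slice the flat list into per-prompt groups. The grouping logic in isolation, with toy counts and strings standing in for images:

```python
# Toy version of the per-prompt regrouping added above: n_images_in_text holds the
# number of image tokens found in each prompt; the flat image list is split accordingly.
from itertools import accumulate

n_images_in_text = [2, 1, 3]                                # image tokens per prompt (toy values)
images = ["img0", "img1", "img2", "img3", "img4", "img5"]   # flat list of "images"

assert sum(n_images_in_text) == len(images)

cumsum = [0] + list(accumulate(n_images_in_text))
grouped = [images[cumsum[i] : cumsum[i + 1]] for i in range(len(n_images_in_text))]
print(grouped)  # [['img0', 'img1'], ['img2'], ['img3', 'img4', 'img5']]
```
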
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 45afe685f5209c..4b10d8d2d03a81 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -13,12 +13,9 @@
# limitations under the License.
"""Idefics3 model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- intializer_range (`float`, *optional*, defaults to 0.02):
- The standard deviation for initializing all weight matrices in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing all weight matrices in the model.
Example:
@@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig):
>>> configuration = model.config
```"""
- model_type = "idefics3"
+ model_type = "idefics3_vision"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -107,24 +104,6 @@ def __init__(
self.hidden_act = hidden_act
self.initializer_range = initializer_range
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from Idefics3Config
- if config_dict.get("model_type") == "idefics3":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class Idefics3Config(PretrainedConfig):
r"""
@@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig):
```"""
model_type = "idefics3"
- is_composition = True
+ sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig}
def __init__(
self,
@@ -204,4 +183,4 @@ def __init__(
self.text_config = text_config
self.scale_factor = scale_factor
- super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+ super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings)
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 495ac04595fbc6..05a1a396dc72d3 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -151,9 +151,11 @@ def get_resize_output_image_size(
def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
"""
Convert a single image or a list of images to a list of numpy arrays.
+
Args:
images (`ImageInput`):
A single image or a list of images.
+
Returns:
A list of numpy arrays.
"""
@@ -168,6 +170,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
isinstance(images, (list, tuple))
and len(images) > 0
and isinstance(images[0], (list, tuple))
+ and len(images[0]) > 0
and is_valid_image(images[0][0])
):
pass
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index ceafa26a8b1187..872f5206f20175 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -17,6 +17,7 @@
"""
import re
+from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from ...feature_extraction_utils import BatchFeature
@@ -241,11 +242,31 @@ def __call__(
n_images_in_images = []
inputs = BatchFeature()
+ if text is not None:
+ if isinstance(text, str):
+ text = [text]
+ elif not isinstance(text, list) and not isinstance(text[0], str):
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+ n_images_in_text = [sample.count(self.image_token.content) for sample in text]
+
if images is not None:
if is_image_or_image_url(images):
images = [[images]]
elif isinstance(images, list) and is_image_or_image_url(images[0]):
- images = [images]
+ if text is not None:
+ if sum(n_images_in_text) != len(images):
+ raise ValueError(
+ f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed."
+ f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images."
+ )
+ # Reorganize the images to match the prompts
+ cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
+ images = [
+ images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]]
+ for i in range(len(n_images_in_text))
+ ]
+ else:
+ images = [images]
elif (
not isinstance(images, list)
and not isinstance(images[0], list)
@@ -263,10 +284,10 @@ def __call__(
inputs.update(image_inputs)
if text is not None:
- if isinstance(text, str):
- text = [text]
- elif not isinstance(text, list) and not isinstance(text[0], str):
- raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+ if n_images_in_images != n_images_in_text:
+ raise ValueError(
+ f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
+ )
image_rows = inputs.pop("rows", [[0] * len(text)])
image_cols = inputs.pop("cols", [[0] * len(text)])
@@ -277,8 +298,6 @@ def __call__(
prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
- n_images_in_text.append(sample.count(image_token))
-
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
@@ -305,11 +324,6 @@ def __call__(
text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
inputs.update(text_inputs)
- if n_images_in_images != n_images_in_text:
- raise ValueError(
- f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
- )
-
return inputs
def batch_decode(self, *args, **kwargs):
diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py
index a274212a945e04..6124dba3a08efe 100644
--- a/src/transformers/models/instructblip/configuration_instructblip.py
+++ b/src/transformers/models/instructblip/configuration_instructblip.py
@@ -14,13 +14,10 @@
# limitations under the License.
"""InstructBLIP model configuration"""
-import os
-from typing import Union
-
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig):
```"""
model_type = "instructblip_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -108,24 +106,6 @@ def __init__(
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from InstructBlipConfig
- if config_dict.get("model_type") == "instructblip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class InstructBlipQFormerConfig(PretrainedConfig):
r"""
@@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig):
```"""
model_type = "instructblip_qformer"
+ base_config_key = "qformer_config"
def __init__(
self,
@@ -229,24 +210,6 @@ def __init__(
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the qformer config dict if we are loading from InstructBlipConfig
- if config_dict.get("model_type") == "instructblip":
- config_dict = config_dict["qformer_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class InstructBlipConfig(PretrainedConfig):
r"""
@@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig):
```"""
model_type = "instructblip"
+ sub_configs = {
+ "text_config": AutoConfig,
+ "qformer_config": InstructBlipQFormerConfig,
+ "vision_config": InstructBlipVisionConfig,
+ }
def __init__(
self,
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 5cce774ce0716a..e5622185bc39a8 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -1471,7 +1471,7 @@ def forward(
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@@ -1610,7 +1610,7 @@ def generate(
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@@ -1625,27 +1625,10 @@ def generate(
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
- outputs = self.language_model.generate(
- inputs_embeds=inputs_embeds,
- attention_mask=attention_mask,
- **generate_kwargs,
- )
-
- # this is a temporary workaround to be consistent with other generation models and
- # have BOS as the first token, even though under the hood we are calling LM with embeds
+ inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
if not self.language_model.config.is_encoder_decoder:
- # the InstructBLIP authors used inconsistent tokenizer/model files during training,
- # with the tokenizer's bos token being set to </s> which has ID=2,
- # whereas the model's text config has bos token id = 0
- bos_token_id = (
- 2
- if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
- else self.config.text_config.bos_token_id
- )
- bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
- if not isinstance(outputs, torch.Tensor):
- outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
- else:
- outputs = torch.cat([bos_tokens, outputs], dim=-1)
+ inputs["input_ids"] = input_ids
+
+ outputs = self.language_model.generate(**inputs, **generate_kwargs)
return outputs
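
For context on the `generate` hunk above: instead of calling `language_model.generate` with embeddings only and then manually prepending a hard-coded BOS token, the model now forwards the original prompt `input_ids` for decoder-only backbones, letting the generation loop itself keep track of the prompt. A hedged end-to-end sketch of the call path this serves; the checkpoint and image URL are illustrative:

import requests
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="What is unusual about this image?", return_tensors="pt")

# with the refactor, the returned ids for decoder-only backbones should start with the
# prompt tokens rather than a synthetic BOS token
generated_ids = model.generate(**inputs, max_new_tokens=30)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
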
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index 05ff9871f4d731..a96d97fb07e1d9 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -78,8 +78,11 @@ class InstructBlipProcessor(ProcessorMixin):
qformer_tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
- self.image_token = AddedToken("<image>", normalized=False, special=True)
- tokenizer.add_tokens([self.image_token], special_tokens=True)
+ if not hasattr(tokenizer, "image_token"):
+ self.image_token = AddedToken("<image>", normalized=False, special=True)
+ tokenizer.add_tokens([self.image_token], special_tokens=True)
+ else:
+ self.image_token = tokenizer.image_token
self.num_query_tokens = num_query_tokens
super().__init__(image_processor, tokenizer, qformer_tokenizer)
@@ -145,7 +148,7 @@ def __call__(
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# cast to desired return tensors type after concatenating
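
The processor change above guards the image placeholder: if the tokenizer already exposes an `image_token` attribute, it is reused instead of appending a fresh special token, which keeps the vocabulary size and the placeholder id stable across save/load round trips. A small sketch of checking that id, assuming the public checkpoint below:

from transformers import InstructBlipProcessor

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
image_token = getattr(processor.tokenizer, "image_token", processor.image_token)
image_token_id = processor.tokenizer.convert_tokens_to_ids(str(image_token))
# reloading the processor should give the same id, since an existing
# tokenizer.image_token is reused rather than a new token being appended
print(image_token, image_token_id)
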
diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
index e7c8eeccef98b4..14687a96e54f37 100644
--- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
@@ -19,13 +19,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-from typing import Union
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig):
```"""
model_type = "instructblipvideo_vision_model"
+ base_config_key = "vision_config"
def __init__(
self,
@@ -113,24 +112,6 @@ def __init__(
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from InstructBlipVideoConfig
- if config_dict.get("model_type") == "instructblipvideo":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class InstructBlipVideoQFormerConfig(PretrainedConfig):
r"""
@@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig):
```"""
model_type = "instructblipvideo_qformer"
+ base_config_key = "qformer_config"
def __init__(
self,
@@ -234,24 +216,6 @@ def __init__(
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
- # get the qformer config dict if we are loading from InstructBlipVideoConfig
- if config_dict.get("model_type") == "instructblipvideo":
- config_dict = config_dict["qformer_config"]
-
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
- )
-
- return cls.from_dict(config_dict, **kwargs)
-
class InstructBlipVideoConfig(PretrainedConfig):
r"""
@@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
```"""
model_type = "instructblipvideo"
+ sub_configs = {
+ "text_config": AutoConfig,
+ "qformer_config": InstructBlipVideoQFormerConfig,
+ "vision_config": InstructBlipVideoVisionConfig,
+ }
def __init__(
self,
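
The video configuration mirrors the image one: `sub_configs` lets the shared config machinery know which class owns each nested section, with the text backbone resolved dynamically through `AutoConfig`. A small sketch built only from defaults (the sub-config classes are assumed to be importable from the top-level package, as for InstructBlip):

from transformers import (
    InstructBlipVideoConfig,
    InstructBlipVideoQFormerConfig,
    InstructBlipVideoVisionConfig,
)

config = InstructBlipVideoConfig(
    vision_config=InstructBlipVideoVisionConfig().to_dict(),
    qformer_config=InstructBlipVideoQFormerConfig().to_dict(),
    text_config=None,  # composite config falls back to its default text backbone
)
print(sorted(InstructBlipVideoConfig.sub_configs))  # ['qformer_config', 'text_config', 'vision_config']
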
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index c9f12391666c22..b0a494dcfe6cec 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -24,7 +24,6 @@
from typing import Any, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
@@ -347,104 +346,6 @@ def _init_weights(self, module):
module.bias.data.zero_()
-INSTRUCTBLIPVIDEO_START_DOCSTRING = r"""
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
- etc.)
-
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
- and behavior.
-
- Parameters:
- config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r"""
- Args:
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See
- [`InstructBlipVideoProcessor.__call__`] for details.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
- Whether to interpolate the pre-trained position encodings.
-"""
-
-INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r"""
- Args:
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See
- [`InstructBlipVideoProcessor.__call__`] for details.
-
- qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
- to serve as text prompt, which the Q-Former model will encode.
-
- Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
-
- qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
-
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
- provided to serve as text prompt, which the language model can continue.
-
- Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
-
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
-
- decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
- Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
- encoder-decoder language model (like T5) is used.
-
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
- [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
-
- decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
- Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
- be used by default.
-
- Only relevant in case an encoder-decoder language model (like T5) is used.
-
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
- Whether to interpolate the pre-trained position encodings.
-"""
-
-
class InstructBlipVideoEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
@@ -531,6 +432,24 @@ def forward(
)
+INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See
+ [`InstructBlipVideoProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+"""
+
+
class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel):
main_input_name = "pixel_values"
config_class = InstructBlipVideoVisionConfig
@@ -1268,6 +1187,87 @@ def forward(
)
+INSTRUCTBLIPVIDEO_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See
+ [`InstructBlipVideoProcessor.__call__`] for details.
+
+ qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
+ to serve as text prompt, which the Q-Former model will encode.
+
+ Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
+ details.
+
+ [What are input IDs?](../glossary#input-ids)
+
+ qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
+ provided to serve as text prompt, which the language model can continue.
+
+ Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
+ details.
+
+ [What are input IDs?](../glossary#input-ids)
+
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
+ encoder-decoder language model (like T5) is used.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+
+ Only relevant in case an encoder-decoder language model (like T5) is used.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+"""
+
+
@add_start_docstrings(
"""
InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
@@ -1660,27 +1660,10 @@ def generate(
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
- outputs = self.language_model.generate(
- inputs_embeds=inputs_embeds,
- attention_mask=attention_mask,
- **generate_kwargs,
- )
-
- # this is a temporary workaround to be consistent with other generation models and
- # have BOS as the first token, even though under the hood we are calling LM with embeds
+ inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
if not self.language_model.config.is_encoder_decoder:
- # the InstructBLIP authors used inconsistent tokenizer/model files during training,
- # with the tokenizer's bos token being set to </s> which has ID=2,
- # whereas the model's text config has bos token id = 0
- bos_token_id = (
- 2
- if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
- else self.config.text_config.bos_token_id
- )
- bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
- if not isinstance(outputs, torch.Tensor):
- outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
- else:
- outputs = torch.cat([bos_tokens, outputs], dim=-1)
+ inputs["input_ids"] = input_ids
+
+ outputs = self.language_model.generate(**inputs, **generate_kwargs)
return outputs
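
The video model gets the same `generate` rewrite. One practical consequence shared with the image model: for decoder-only backbones the returned sequences should now begin with the prompt tokens instead of a synthetic BOS, so callers that only want the completion can slice the prompt off before decoding. A hedged sketch, with `model`, `processor`, and `inputs` assumed to come from the usual InstructBlipVideo usage:

# `inputs` is assumed to be the output of InstructBlipVideoProcessor with return_tensors="pt"
generated_ids = model.generate(**inputs, max_new_tokens=30)
prompt_len = inputs["input_ids"].shape[1]
new_token_ids = generated_ids[:, prompt_len:]  # drop the echoed prompt
print(processor.batch_decode(new_token_ids, skip_special_tokens=True)[0])
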
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
index 2128f25df6625d..b0dc8a215740f1 100644
--- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -32,7 +32,7 @@
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ..auto import CONFIG_MAPPING, AutoConfig
logger = logging.get_logger(__name__)
@@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
```"""
model_type = "instructblipvideo"
+ sub_configs = {
+ "text_config": AutoConfig,
+ "qformer_config": InstructBlipVideoQFormerConfig,
+ "vision_config": InstructBlipVideoVisionConfig,
+ }
def __init__(
self,
@@ -468,27 +473,10 @@ def generate(
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
- outputs = self.language_model.generate(
- inputs_embeds=inputs_embeds,
- attention_mask=attention_mask,
- **generate_kwargs,
- )
-
- # this is a temporary workaround to be consistent with other generation models and
- # have BOS as the first token, even though under the hood we are calling LM with embeds
+ inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
if not self.language_model.config.is_encoder_decoder:
- # the InstructBLIP authors used inconsistent tokenizer/model files during training,
- # with the tokenizer's bos token being set to </s> which has ID=2,
- # whereas the model's text config has bos token id = 0
- bos_token_id = (
- 2
- if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
- else self.config.text_config.bos_token_id
- )
- bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
- if not isinstance(outputs, torch.Tensor):
- outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
- else:
- outputs = torch.cat([bos_tokens, outputs], dim=-1)
+ inputs["input_ids"] = input_ids
+
+ outputs = self.language_model.generate(**inputs, **generate_kwargs)
return outputs
diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
index 3e96d279a42f8d..1d4e59e26b4621 100644
--- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -63,8 +63,11 @@ class InstructBlipVideoProcessor(ProcessorMixin):
qformer_tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
- self.video_token = AddedToken("