diff --git a/.circleci/config.yml b/.circleci/config.yml
index d5e9ac799fe728..044493315d2003 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -208,7 +208,6 @@ jobs:
- run: python utils/check_doctest_list.py
- run: make deps_table_check_updated
- run: python utils/update_metadata.py --check-only
- - run: python utils/check_task_guides.py
- run: python utils/check_docstrings.py
- run: python utils/check_support_list.py
diff --git a/.github/workflows/model-templates.yml b/.github/workflows/model-templates.yml
deleted file mode 100644
index d34a28508eef67..00000000000000
--- a/.github/workflows/model-templates.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Model templates runner
-
-on:
- repository_dispatch:
- schedule:
- - cron: "0 2 * * *"
-
-jobs:
- run_tests_templates:
- runs-on: ubuntu-22.04
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
-
- - name: Install dependencies
- run: |
- sudo apt -y update && sudo apt install -y libsndfile1-dev
-
- - name: Load cached virtual environment
- uses: actions/cache@v2
- id: cache
- with:
- path: ~/venv/
- key: v4-tests_templates-${{ hashFiles('setup.py') }}
-
- - name: Create virtual environment on cache miss
- if: steps.cache.outputs.cache-hit != 'true'
- run: |
- python -m venv ~/venv && . ~/venv/bin/activate
- pip install --upgrade pip!=21.3
- pip install -e .[dev]
-
- - name: Check transformers location
- # make `transformers` available as package (required since we use `-e` flag) and check it's indeed from the repo.
- run: |
- . ~/venv/bin/activate
- python setup.py develop
- transformer_loc=$(pip show transformers | grep "Location: " | cut -c11-)
- transformer_repo_loc=$(pwd .)
- if [ "$transformer_loc" != "$transformer_repo_loc/src" ]; then
- echo "transformers is from $transformer_loc but it shoud be from $transformer_repo_loc/src."
- echo "A fix is required. Stop testing."
- exit 1
- fi
-
- - name: Create model files
- run: |
- . ~/venv/bin/activate
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
- make style
- python utils/check_table.py --fix_and_overwrite
- python utils/check_dummies.py --fix_and_overwrite
- python utils/check_copies.py --fix_and_overwrite
-
- - name: Run all non-slow tests
- run: |
- . ~/venv/bin/activate
- python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
-
- - name: Run style changes
- run: |
- . ~/venv/bin/activate
- make style && make quality && make repo-consistency
-
- - name: Failure short reports
- if: ${{ always() }}
- run: cat reports/tests_templates/failures_short.txt
-
- - name: Test suite reports artifacts
- if: ${{ always() }}
- uses: actions/upload-artifact@v4
- with:
- name: run_all_tests_templates_test_reports
- path: reports/tests_templates
diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
index 2ba0b917cad9dd..f88af8e39af27d 100644
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@@ -28,7 +28,7 @@ env:
CUDA_VISIBLE_DEVICES: 0,1
jobs:
- model_job:
+ run_models_gpu:
name: " "
strategy:
fail-fast: false
@@ -80,23 +80,23 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
- run: python3 -m pytest -rs -v --make-reports=${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+ run: python3 -m pytest -rs -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+ run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Run test
shell: bash
run: |
- mkdir -p /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
- echo "hello" > /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/hello.txt
- echo "${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}"
+ mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+ echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+ echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
- - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+ - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
+ name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
diff --git a/.github/workflows/self-new-model-pr-caller.yml b/.github/workflows/self-new-model-pr-caller.yml
new file mode 100644
index 00000000000000..888b4f7a8ce5ac
--- /dev/null
+++ b/.github/workflows/self-new-model-pr-caller.yml
@@ -0,0 +1,112 @@
+name: PR slow CI
+
+on:
+ pull_request:
+ paths:
+ - "src/transformers/models/*/modeling_*.py"
+
+env:
+ HF_HOME: /mnt/cache
+ TRANSFORMERS_IS_CI: yes
+ OMP_NUM_THREADS: 8
+ MKL_NUM_THREADS: 8
+ RUN_SLOW: yes
+  # For gated repositories, we still need to agree to share information on the Hub repository page in order to get access.
+ # This token is created under the bot `hf-transformers-bot`.
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+ TF_FORCE_GPU_ALLOW_GROWTH: true
+ RUN_PT_TF_CROSS_TESTS: 1
+ CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+ check_for_new_model:
+ runs-on: ubuntu-22.04
+    name: Check if the PR adds a new model
+ outputs:
+ new_model: ${{ steps.check_new_model.outputs.new_model }}
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: "0"
+
+ - name: Check if there is a new model
+ id: check_new_model
+ run: |
+ python -m pip install GitPython
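+          # check_if_new_model_added.py is expected to print the new model's test folder (e.g. `models/foo`) as its last line, or an empty string when none was added.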
+ echo "new_model=$(python utils/check_if_new_model_added.py | tail -n 1)" >> $GITHUB_OUTPUT
+
+ run_models_gpu:
+ name: Run all tests for the new model
+    # Run only when the PR adds a new model and carries the `single-model-run-slow` label.
+ if: ${{ needs.check_for_new_model.outputs.new_model != '' && contains(github.event.pull_request.labels.*.name, 'single-model-run-slow') }}
+ needs: check_for_new_model
+ strategy:
+ fail-fast: false
+ matrix:
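+        # One-entry matrix: only the newly detected model folder is tested, on both single- and multi-GPU runners.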
+ folders: ["${{ needs.check_for_new_model.outputs.new_model }}"]
+ machine_type: [single-gpu, multi-gpu]
+ runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+ container:
+ image: huggingface/transformers-all-latest-gpu
+ options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
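+      # The host Hugging Face cache (/mnt/cache/.cache/huggingface) is mounted at /mnt/cache, matching HF_HOME above.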
+ steps:
+ - name: Echo input and matrix info
+ shell: bash
+ run: |
+ echo "${{ matrix.folders }}"
+
+ - name: Echo folder ${{ matrix.folders }}
+ shell: bash
+        # For a folder like `models/bert`, set an environment variable `matrix_folders` to `models_bert`, which is
+        # used to name the artifact folders (the character `/` is not allowed in artifact names).
+ run: |
+ echo "${{ matrix.folders }}"
+ matrix_folders=${{ matrix.folders }}
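+          # Bash pattern substitution: replace the first `models/` with `models_`.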
+ matrix_folders=${matrix_folders/'models/'/'models_'}
+ echo "$matrix_folders"
+ echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+ - name: Update clone
+ working-directory: /transformers
+ run: git fetch && git checkout ${{ github.event.pull_request.head.sha }}
+
+ - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+ working-directory: /transformers
+ run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+ - name: NVIDIA-SMI
+ run: |
+ nvidia-smi
+
+ - name: Environment
+ working-directory: /transformers
+ run: |
+ python3 utils/print_env.py
+
+ - name: Show installed libraries and their versions
+ working-directory: /transformers
+ run: pip freeze
+
+ - name: Run all tests on GPU
+ working-directory: /transformers
+ run: python3 -m pytest -v -rs --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+ - name: Failure short reports
+ if: ${{ failure() }}
+ continue-on-error: true
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+ - name: Make sure report directory exists
+ shell: bash
+ run: |
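+          # Create the directory and a placeholder file so the artifact upload step below never fails.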
+ mkdir -p /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+ echo "hello" > /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+ echo "${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index 7906325e83bb9d..875e715b068b6c 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -2,7 +2,7 @@ name: Self-hosted runner (nightly-ci)
# Note that each job's dependencies go into a corresponding docker file.
#
-# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# For example for `run_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
@@ -183,7 +183,7 @@ jobs:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
- run_all_tests_torch_cuda_extensions_gpu:
+ run_torch_cuda_extensions_gpu:
name: Torch CUDA extension tests
strategy:
fail-fast: false
@@ -231,19 +231,19 @@ jobs:
- name: Run all tests on GPU
working-directory: /workspace/transformers
run: |
- python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+ python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly
- path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
@@ -253,7 +253,7 @@ jobs:
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
- run_all_tests_torch_cuda_extensions_gpu
+ run_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
index 7be658c43202ff..ca47c454f6894a 100644
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -2,7 +2,7 @@ name: Self-hosted runner (past-ci)
# Note that each job's dependencies go into a corresponding docker file.
#
-# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# For example for `run_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
@@ -228,7 +228,7 @@ jobs:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
- run_all_tests_torch_cuda_extensions_gpu:
+ run_torch_cuda_extensions_gpu:
name: Torch CUDA extension tests
if: inputs.framework == 'pytorch'
strategy:
@@ -286,19 +286,19 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+ python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
- path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
+ path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
@@ -308,7 +308,7 @@ jobs:
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
- run_all_tests_torch_cuda_extensions_gpu
+ run_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml
index b285a5f8fc0ad8..8705f398b2b510 100644
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@@ -145,7 +145,7 @@ jobs:
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT
- run_tests_amdgpu:
+ run_models_gpu:
name: Model tests
needs: setup_gpu
# `dummy` means there is no test to run
@@ -230,19 +230,19 @@ jobs:
- name: Run all non-slow selected tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
+ python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+ name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
send_results:
name: Send results to webhook
@@ -252,7 +252,7 @@ jobs:
check_runner_status,
check_runners,
setup_gpu,
- run_tests_amdgpu,
+ run_models_gpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
]
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 17dff31fa4e330..1bc02ccd826eb0 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -385,19 +385,19 @@ jobs:
working-directory: /workspace/transformers
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
run: |
- python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+ python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
- path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
run_tests_torch_cuda_extensions_multi_gpu:
name: Torch CUDA extension tests
@@ -475,19 +475,19 @@ jobs:
working-directory: /workspace/transformers
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
run: |
- python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+ python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
- path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 09926071802a7a..d2ab90d1331848 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -108,7 +108,7 @@ jobs:
run: |
python3 utils/print_env.py
- run_tests_single_gpu:
+ run_models_gpu_single_gpu:
name: Single GPU tests
strategy:
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
@@ -162,21 +162,21 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
- run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+ name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
- run_tests_multi_gpu:
+ run_models_gpu_multi_gpu:
name: Multi GPU tests
strategy:
max-parallel: 1
@@ -230,19 +230,19 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
- run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+ name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
run_examples_gpu:
name: Examples tests
@@ -287,19 +287,19 @@ jobs:
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
- python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+ python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_examples_gpu
- path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+ name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
run_pipelines_torch_gpu:
name: PyTorch pipelines tests
@@ -343,21 +343,21 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+ python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
- path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+ name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
- run_tests_torch_deepspeed_gpu:
+ run_torch_cuda_extensions_gpu:
name: Torch ROCm deepspeed tests
strategy:
fail-fast: false
@@ -400,19 +400,19 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
- run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended
+ run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
- path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
run_extract_warnings:
name: Extract warnings in CI artifacts
@@ -422,11 +422,11 @@ jobs:
check_runner_status,
check_runners,
setup,
- run_tests_single_gpu,
- run_tests_multi_gpu,
+ run_models_gpu_single_gpu,
+ run_models_gpu_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
- run_tests_torch_deepspeed_gpu
+ run_torch_cuda_extensions_gpu
]
steps:
- name: Checkout transformers
@@ -471,11 +471,11 @@ jobs:
check_runner_status,
check_runners,
setup,
- run_tests_single_gpu,
- run_tests_multi_gpu,
+ run_models_gpu_single_gpu,
+ run_models_gpu_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
- run_tests_torch_deepspeed_gpu,
+ run_torch_cuda_extensions_gpu,
run_extract_warnings
]
steps:
diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml
index 59b992bcd250e2..40689c629a09bf 100644
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@@ -14,7 +14,7 @@ jobs:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
with:
- job: run_tests_gpu
+ job: run_models_gpu
slack_report_channel: "#transformers-ci-daily-models"
secrets: inherit
@@ -46,7 +46,7 @@ jobs:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
- job: run_all_tests_torch_cuda_extensions_gpu
+ job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-deepspeed"
secrets: inherit
@@ -54,6 +54,6 @@ jobs:
name: Quantization CI
uses: ./.github/workflows/self-scheduled.yml
with:
- job: run_tests_quantization_torch_gpu
+ job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
secrets: inherit
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index fa41bffc0bc826..5911c81bf4f95d 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -2,7 +2,7 @@ name: Self-hosted runner (scheduled)
# Note that each job's dependencies go into a corresponding docker file.
#
-# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# For example for `run_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
@@ -33,7 +33,7 @@ env:
jobs:
setup:
- if: contains(fromJSON('["run_tests_gpu", "run_tests_quantization_torch_gpu"]'), inputs.job)
+ if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
name: Setup
strategy:
matrix:
@@ -64,7 +64,7 @@ jobs:
run: pip freeze
- id: set-matrix
- if: ${{ inputs.job == 'run_tests_gpu' }}
+ if: ${{ inputs.job == 'run_models_gpu' }}
name: Identify models to test
working-directory: /transformers/tests
run: |
@@ -72,7 +72,7 @@ jobs:
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
- id: set-matrix-quantization
- if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
+ if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: Identify quantization method to test
working-directory: /transformers/tests
run: |
@@ -82,8 +82,8 @@ jobs:
run: |
nvidia-smi
- run_tests_gpu:
- if: ${{ inputs.job == 'run_tests_gpu' }}
+ run_models_gpu:
+ if: ${{ inputs.job == 'run_models_gpu' }}
name: " "
needs: setup
strategy:
@@ -134,19 +134,19 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+ python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
- path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+ name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
@@ -185,19 +185,19 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
+ python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ always() }}
run: |
- cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
+ cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
- path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
+ name: ${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
@@ -236,22 +236,22 @@ jobs:
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
- python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+ python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_examples_gpu
- path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+ name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
- run_all_tests_torch_cuda_extensions_gpu:
- if: ${{ inputs.job == 'run_all_tests_torch_cuda_extensions_gpu' }}
+ run_torch_cuda_extensions_gpu:
+ if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
name: Torch CUDA extension tests
strategy:
fail-fast: false
@@ -296,22 +296,22 @@ jobs:
- name: Run all tests on GPU
working-directory: /workspace/transformers
run: |
- python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+ python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+ run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
- path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+ name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+ path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
- run_tests_quantization_torch_gpu:
- if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
+ run_quantization_torch_gpu:
+ if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: " "
needs: setup
strategy:
@@ -357,26 +357,26 @@ jobs:
- name: Run quantization tests on GPU
working-directory: /transformers
run: |
- python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+ python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
- run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}/failures_short.txt
+ run: cat /transformers/reports/${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}"
+ - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
- name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}
- path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}
+ name: ${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
+ path: /transformers/reports/${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
run_extract_warnings:
- # Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
- if: ${{ always() && inputs.job == 'run_tests_gpu' }}
+ # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
+ if: ${{ always() && inputs.job == 'run_models_gpu' }}
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
- needs: [setup, run_tests_gpu]
+ needs: [setup, run_models_gpu]
steps:
- name: Checkout transformers
uses: actions/checkout@v4
@@ -416,12 +416,12 @@ jobs:
name: Slack Report
needs: [
setup,
- run_tests_gpu,
+ run_models_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
- run_all_tests_torch_cuda_extensions_gpu,
- run_tests_quantization_torch_gpu,
+ run_torch_cuda_extensions_gpu,
+ run_quantization_torch_gpu,
run_extract_warnings
]
if: ${{ always() }}
diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml
index 88660914bfdc65..75905dde495e98 100644
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@@ -35,7 +35,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
- name: Send message to Slack
- if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }}
+ if: ${{ inputs.job != 'run_quantization_torch_gpu' }}
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@@ -61,7 +61,7 @@ jobs:
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
# Only the model testing job is concerned for this step
- if: ${{ inputs.job == 'run_tests_gpu' }}
+ if: ${{ inputs.job == 'run_models_gpu' }}
uses: actions/upload-artifact@v4
with:
name: prev_ci_results
@@ -70,7 +70,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
- name: Send message to Slack for quantization workflow
- if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
+ if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9aee200ba4120e..c67e83b8fa2b4b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -110,7 +110,7 @@ New models are constantly released and if you want to implement a new model, ple
If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!
-We have added a [detailed guide and templates](https://github.com/huggingface/transformers/tree/main/templates) to help you get started with adding a new model, and we also have a more technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
+We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
## Do you want to add documentation?
diff --git a/Makefile b/Makefile
index 49535b5694d6fd..ebc66d922cdd1b 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,6 @@ repo-consistency:
python utils/check_config_attributes.py
python utils/check_doctest_list.py
python utils/update_metadata.py --check-only
- python utils/check_task_guides.py
python utils/check_docstrings.py
python utils/check_support_list.py
@@ -85,7 +84,6 @@ fix-copies:
python utils/check_table.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
- python utils/check_task_guides.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite
# Run tests for the library
diff --git a/README.md b/README.md
index 24032d4a536f69..d87b55414ce45c 100644
--- a/README.md
+++ b/README.md
@@ -294,276 +294,7 @@ Follow the installation pages of Flax, PyTorch or TensorFlow to see how to insta
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) released by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano: Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from Meta Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T - Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
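Whatever the backing framework, every architecture above is loaded through the same Auto classes. As a minimal sketch (the checkpoint name `bert-base-uncased` is only an illustrative example), the following loads a model together with its tokenizer and reports whether that tokenizer is backed by 🤗 Tokenizers:

```python
from transformers import AutoModel, AutoTokenizer

# Illustrative checkpoint; any checkpoint of a supported architecture works the same way.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)  # the PyTorch implementation by default

# `is_fast` is True when the tokenizer is backed by the 🤗 Tokenizers library.
print(type(model).__name__, tokenizer.is_fast)
```

Where the table indicates TensorFlow or Flax support, `TFAutoModel` or `FlaxAutoModel` can be swapped in for `AutoModel`.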
diff --git a/README_de.md b/README_de.md
index c602c50bc49ac2..fc60bfe31a4a13 100644
--- a/README_de.md
+++ b/README_de.md
@@ -290,276 +290,7 @@ Folgen Sie den Installationsanleitungen von Flax, PyTorch oder TensorFlow, um zu
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the blog post [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre DĂ©fossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier HĂ©naff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, JoĂŁo Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper StableLM 3B 4E1T (Technical Report) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformer)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin-Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr DollĂĄr, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to support you in adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to read the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to gather feedback before starting your PR.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
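In practice, that table boils down to which backend can run a given checkpoint through the Auto classes. A minimal sketch, assuming `transformers` is installed with a PyTorch backend (`bert-base-uncased` is used purely as an example checkpoint):

```python
# Minimal sketch: load a supported architecture and its tokenizer,
# then run a single forward pass. Assumes `pip install transformers torch`;
# "bert-base-uncased" is only an example checkpoint.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")  # "pt" selects PyTorch tensors
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch size, sequence length, hidden size)
```

For architectures the table marks as supported in TensorFlow or Flax, the same pattern applies with `TFAutoModel` or `FlaxAutoModel` and the matching `return_tensors` value.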
diff --git a/README_es.md b/README_es.md
index a73de46252610c..097fb4fce88797 100644
--- a/README_es.md
+++ b/README_es.md
@@ -267,276 +267,7 @@ Sigue las páginas de instalación de Flax, PyTorch o TensorFlow para ver cómo
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang and Ledell Wu.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by the Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from Meta AI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Research and Technology](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/abs/2005.00341) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong, etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong, etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from Meta Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su, Yu Lu, Shengfeng Pan, Bo Wen, Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in the repository [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T - Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin-Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contribution guidelines](./CONTRIBUTING.md) and reach out to the maintainers or open an issue to gather feedback before starting your PR.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
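Any architecture in that table can be loaded through the Auto classes without knowing its concrete model class. A minimal sketch, assuming PyTorch is installed and using `bert-base-uncased` purely as an illustrative checkpoint:

```python
from transformers import AutoModel, AutoTokenizer

# The checkpoint name is illustrative; any Hub identifier for a
# supported architecture can be substituted here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Tokenize a sentence and run a forward pass.
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```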
diff --git a/README_fr.md b/README_fr.md
index d42f65061f8075..d58bb0bbca385d 100644
--- a/README_fr.md
+++ b/README_fr.md
@@ -289,275 +289,7 @@ Suivez les pages d'installation de Flax, PyTorch ou TensorFlow pour voir comment
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by the Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from the BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from the University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from the University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (de Technology Innovation Institute) par Almazrouei, Ebtesam et Alobeidli, Hamza et Alshamsi, Abdulaziz et Cappelli, Alessandro et Cojocaru, Ruxandra et Debbah, Merouane et Goffinet, Etienne et Heslow, Daniel et Launay, Julien et Malartic, Quentin et Noune, Badreddine et Pannier, Baptiste et Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (d'ESPnet) publié dans l'article [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) par Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang et Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (de Google AI) publié dans le référentiel [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) par Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le et Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (de Google AI) publié dans le référentiel [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) par Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le et Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (du CNRS) publié dans l'article [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) par Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoßt Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (de Facebook AI) publié dans l'article [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) par Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach et Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (de Google Research) publié dans l'article [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) par James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (de Microsoft Research) publié dans l'article [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) par Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (de l'Université Carnegie Mellon/Google Brain) publié dans l'article [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) par Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (de ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, SaÄnak TaĆırlar. PubliĂ© dans l'article [blog post](https://www.adept.ai/blog/fuyu-8b)
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (de Google) publié dans l'article [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) parthe Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (de Microsoft Research) publié dans l'article [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) par Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (de la KAIST) publié dans l'article [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) par Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (d'OpenAI) publié dans l'article [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) par Alec Radford, Karthik Narasimhan, Tim Salimans et Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (d'EleutherAI) publié dans le référentiel [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) par Sid Black, Stella Biderman, Leo Gao, Phil Wang et Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (d'EleutherAI) publié dans l'article [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) par Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (de ABEJA) publié par Shinya Otani, Takayoshi Makabe, Anuj Arora et Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (d'OpenAI) a été publié dans l'article [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) par Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei et Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (d'EleutherAI) a été publié dans le dépÎt [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) par Ben Wang et Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (d'AI-Sweden) a Ă©tĂ© publiĂ© dans l'article [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) par Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Ăhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (de BigCode) a Ă©tĂ© publiĂ© dans l'article [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) par Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo GarcĂa del RĂo, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** a été publié dans le dépÎt [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) par Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (de Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) publié dans l'article [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) parShilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (de Hugging Face) publié dans l'article [IDEFICS2](https://huggingface.co/blog/idefics2) parLéo Tronchon, Hugo Laurencon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (d'OpenAI) a été publié dans l'article [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) par Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jiaxuan Li, Hui Xiong, Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals and João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano: Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo and Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the technical report [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from the University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen and Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from the University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to gather feedback before starting your pull request.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
diff --git a/README_hd.md b/README_hd.md
index 8a67023e2f1879..c72489d88aca5f 100644
--- a/README_hd.md
+++ b/README_hd.md
@@ -241,276 +241,7 @@ conda install conda-forge::transformers
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo LĂŒddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp KrĂ€henbĂŒhl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT-2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet and Microsoft Research) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST à€žà„) à€žà€Ÿà€„ à€”à€Ÿà€Čà€Ÿ à€Șà„à€Șà€° [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) à€Ąà„à€Żà„à€š à€à€żà€ź, à€”à„à€à€à€čà„à€Żà„à€š à€à€Ÿ, à€Șà„à€Żà„à€à€à€”à€Ÿà€š à€à€č, à€Ąà„à€à€à€à„à€Żà„ à€à„, à€žà„à€čà€”à€Ÿà€š à€à„à€š, à€à„à€šà€źà„ à€à€żà€ź à€Šà„à€”à€Ÿà€°à€Ÿà„€
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI à€žà„) à€žà€Ÿà€„ à€źà„à€ à€Šà€żà€Żà€Ÿ à€à€Żà€Ÿ à€Șà„à€Șà€° [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) à€à€Čà„à€ à€°à„à€Ąà€«à„à€°à„à€Ą, à€à€Ÿà€°à„à€€à€żà€ à€šà€°à€žà€żà€źà„à€čà€š, à€à€żà€ź à€žà€Ÿà€Čà€żà€źà€šà„à€ž à€à€° à€à€Čà„à€Żà€Ÿ à€žà„à€€à„à€žà„à€à„à€”à€° à€Šà„à€”à€Ÿà€°à€Ÿà„€
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI à€žà„) à€°à€żà€Șà„à€à€żà€à€°à„ à€à„ à€žà€Ÿà€„ [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) à€°à€żà€Čà„à€à„€ à€žà€żà€Ą à€Źà„à€Čà„à€, à€žà„à€à„à€Čà€Ÿ à€Źà€żà€Ąà€°à€źà„à€š, à€Čà€żà€Żà„ à€à€Ÿà€, à€«à€żà€Č à€”à€Ÿà€à€ à€à€° à€à„à€šà€° à€Čà„à€čà„ à€Šà„à€”à€Ÿà€°à€Ÿ à€Șà„à€žà„à€ à€à€żà€Żà€Ÿ à€à€Żà€Ÿà„€
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI à€žà„) à€Șà„à€Șà€° à€à„ à€žà€Ÿà€„ à€à€Ÿà€°à„ à€à€żà€Żà€Ÿ à€à€Żà€Ÿ [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) à€žà€żà€Ą à€Źà„à€Čà„à€, à€žà„à€à„à€Čà€Ÿ à€Źà€żà€Ąà€°à€źà„à€š, à€à€°à€żà€ à€čà„à€Čà€Ÿà€čà€š, à€à„à€”à„à€à€à€żà€š à€à€à€„à„à€šà„, à€Čà€żà€Żà„ à€à€Ÿà€, à€Čà„à€°à„à€à€ž à€à„à€Čà„à€Ąà€żà€à€, à€čà„à€°à„à€ž à€čà„, à€à„à€šà€° à€Čà„à€čà„, à€à€Ÿà€à€Č à€źà„à€à€Ąà„à€šà„à€Č, à€à„à€žà€š à€«à€Ÿà€à€, à€źà€Ÿà€à€à€Č à€Șà€Ÿà€à€Čà€°, à€Żà„à€à€žà€”à„à€à€žà€à€š à€žà€Ÿà€ à€Șà„à€°à€¶à€Ÿà€à€€ à€Šà„à€”à€Ÿà€°à€Ÿ , à€¶à€żà€”à€Ÿà€à€¶à„ à€Șà„à€°à„à€čà€żà€€, à€Čà€Ÿà€°à€żà€Żà€Ÿ à€°à„à€šà„à€Čà„à€Ąà„à€ž, à€à„à€šà€Ÿà€„à€š à€à„, à€Źà„à€š à€”à€Ÿà€à€, à€žà„à€źà„à€
à€Č à€”à„à€šà€Źà„à€
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (à€
à€Źà„à€à€Ÿ à€à„ à€à€°à€żà€) à€¶à€żà€šà„à€Żà€Ÿ à€à€à€Ÿà€šà„, à€€à€Ÿà€à€Ÿà€Żà„à€¶à„ à€źà€à€Ÿà€Źà„, à€
à€šà„à€ à€
à€°à„à€Ąà€Œà€Ÿ, à€à„à€Żà„ à€čà€à„à€°à„ à€Šà„à€”à€Ÿà€°à€Ÿà„€
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (à€à€Șà€šà€à€à€ à€žà„) à€žà€Ÿà€„ à€źà„à€ à€Șà„à€Șà€° [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) à€à€Čà„à€ à€°à„à€Ąà€«à„à€°à„à€Ą, à€à„à€«à€°à„ à€”à„, à€°à„à€”à€š à€à€Ÿà€à€Čà„à€Ą, à€Ąà„à€”à€żà€Ą à€Čà„à€à€š, à€Ąà€Ÿà€°à€żà€Żà„ à€à€źà„à€Ąà„ à€Šà„à€”à€Ÿà€°à€Ÿ à€à€° à€à€Čà„à€Żà€Ÿ à€žà„à€€à„à€žà€à„à€”à€° à€šà„ à€Șà„à€žà„à€ à€à€żà€Żà€Ÿà„€
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo GarcĂa del RĂo, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by LĂ©o Tronchon, Hugo Laurencon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, HervĂ© JĂ©gou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, TimothĂ©e Lacroix, Baptiste RoziĂšre, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier HĂ©naff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, JoĂŁo Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, SaÄnak TaĆırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio CĂ©sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, SĂ©bastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, SĂ©bastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich KĂŒttler, Mike Lewis, Wen-tau Yih, Tim RocktĂ€schel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Ćukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr DollĂĄr.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault FĂ©vry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook) released with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook) released with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of WĂŒrzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, PaweƂ Krzysztof Nowak, Thomas MĂŒller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr DollĂĄr, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario LuÄiÄ, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco GuzmĂĄn, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have a **detailed guide and templates** to guide you through the process of adding a new model; you can find them in the [`templates`](./templates) directory of the repository. Be sure to check the [contribution guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
+đ€ Transformers currently supports the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them. To check whether a given model already has an implementation in Flax, PyTorch or TensorFlow, or a tokenizer backed by the đ€ Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks); a short sketch of doing this check programmatically follows below.
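To make the pointer above concrete, here is a minimal sketch of checking support programmatically rather than from the table (the checkpoint name `bert-base-uncased` is only an illustrative example): `AutoTokenizer` exposes an `is_fast` attribute that is `True` when the tokenizer is backed by the đ€ Tokenizers library, and a successful `AutoModel.from_pretrained` call confirms that a PyTorch implementation is available.

```python
# Minimal sketch: probe tokenizer and framework support for one checkpoint.
# "bert-base-uncased" is only an illustrative example checkpoint.
from transformers import AutoModel, AutoTokenizer

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)  # True -> backed by the đ€ Tokenizers (Rust) library

model = AutoModel.from_pretrained(checkpoint)  # loads the PyTorch implementation
print(type(model).__name__)  # e.g. BertModel
```

The same check works for any architecture listed in the table; for Flax or TensorFlow you would load `FlaxAutoModel` or `TFAutoModel` instead, which raise an error when that backend is not implemented for the architecture.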
diff --git a/README_ja.md b/README_ja.md
index df7b4f0597a6e2..49db335ad5d62b 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -301,276 +301,7 @@ For instructions on installing Flax, PyTorch, or TensorFlow with conda, see
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-đ€ Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from Ăcole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz SuĂĄrez*, Yoann Dupont, Laurent Romary, Ăric Villemonte de la Clergerie, DjamĂ© Seddah and BenoĂźt Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo LĂŒddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste RoziĂšre, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, JĂ©rĂ©my Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre DĂ©fossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, HervĂ© JĂ©gou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp KrĂ€henbĂŒhl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, TimothĂ©e Darcet, ThĂ©o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, HervĂ© Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2, RoBERTa and Multilingual BERT; the compressed models are named [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation).
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas OÄuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by RenĂ© Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre DĂ©fossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** and **ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet and Microsoft Research) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, LoĂŻc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, BenoĂźt CrabbĂ©, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, SaÄnak TaĆırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Ăhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo GarcĂa del RĂo, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by LĂ©o Tronchon, Hugo Laurencon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio CĂ©sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, SĂ©bastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, SĂ©bastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released in a [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook) released with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T â Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook) released with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas KrauĂ, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin-Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario LuÄiÄ, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI) released with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and to contact the maintainers or open an issue to collect feedback before starting your PR.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
diff --git a/README_ko.md b/README_ko.md
index fc4b10f79fdbf2..cc67dd13b33688 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -216,276 +216,7 @@ how to install these with conda via the Flax, PyTorch, TensorFlow installation pages
Current number of available model checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers provides the following models (see [here](https://huggingface.co/docs/transformers/model_summary) for a summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from Ăcole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz SuĂĄrez*, Yoann Dupont, Laurent Romary, Ăric Villemonte de la Clergerie, DjamĂ© Seddah and BenoĂźt Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo LĂŒddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste RoziĂšre, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, JĂ©rĂ©my Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre DĂ©fossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, HervĂ© JĂ©gou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp KrĂ€henbĂŒhl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, TimothĂ©e Darcet, ThĂ©o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, HervĂ© Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace) released with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas OÄuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by RenĂ© Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre DĂ©fossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet and Microsoft Research) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoßt Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, SaÄnak TaĆırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Ăhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo GarcĂa del RĂo, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by LĂ©o Tronchon, Hugo Laurencon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, HervĂ© JĂ©gou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, TimothĂ©e Lacroix, Baptiste RoziĂšre, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, LĂ©lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, TimothĂ©e Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, LĂ©lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, TimothĂ©e Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier HĂ©naff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, JoĂŁo Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, SaÄnak TaĆırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook) released with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook) released with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI) released with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have a **detailed guide and templates** to help you add it. You can find the guide and templates in the [`templates`](./templates) folder of this repository. Be sure to read the [contribution guidelines](./CONTRIBUTING.md), and contact the maintainers or open an issue to get feedback before starting your PR.
+🤗 Transformers provides the following models: see [here](https://huggingface.co/docs/transformers/model_summary) for a summary of each of them.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or uses a tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
diff --git a/README_pt-br.md b/README_pt-br.md
index 6e427643e5d3a2..6f9f4e8a66a6ea 100644
--- a/README_pt-br.md
+++ b/README_pt-br.md
@@ -299,277 +299,7 @@ Siga as páginas de instalação do Flax, PyTorch ou TensorFlow para ver como in
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma team at Google.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the paper [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with a [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T: Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper StableLM 3B 4E1T (Technical Report) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformer)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-
-1. Want to contribute a new model? We have added a **detailed guide and example templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contribution guidelines](./CONTRIBUTING.md) and reach out to the maintainers or open an issue to collect feedback before starting your PR.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
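The replacement line above collapses the per-language catalog into a pointer to the model summary; in practice, every architecture in the deleted lists stays reachable by checkpoint name through the Auto classes. A minimal sketch, not part of the diff itself, assuming a PyTorch install and using the public `bert-base-uncased` checkpoint purely as a stand-in for any listed model:

```python
# Resolve an architecture from a Hub checkpoint name via the Auto classes.
# "bert-base-uncased" is an example checkpoint; any entry from the list
# above can be loaded the same way, given its checkpoint name.
from transformers import AutoConfig, AutoModel, AutoTokenizer

config = AutoConfig.from_pretrained("bert-base-uncased")
print(config.model_type)  # -> "bert", the architecture backing this checkpoint

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")  # PyTorch tensors
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 5, 768])
```

The same three calls work unchanged for any other entry; only the checkpoint name differs, and `config.model_type` reveals which architecture it maps to.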
diff --git a/README_ru.md b/README_ru.md
index fa55fd88eddce1..71022439858194 100644
--- a/README_ru.md
+++ b/README_ru.md
@@ -289,277 +289,7 @@ conda install conda-forge::transformers
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a detailed description of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) released by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from DeepMind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft Research) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano: Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper StableLM 3B 4E1T (Technical Report) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformer)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-
-1. Want to contribute a new model? We have added a **detailed guide and templates** to help you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to read the [contributing guidelines](./CONTRIBUTING.md) and get in touch with the maintainers or open an issue to collect feedback before starting work on your pull request.
+🤗 Transformers currently provides the following architectures: for a detailed description of each of them, see [here](https://huggingface.co/docs/transformers/model_summary).
To check whether each model has an implementation in Flax, PyTorch, or TensorFlow, or a tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
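Both README hunks point to the supported-frameworks table rather than showing usage, so here is a minimal sketch, outside the diff itself, of what that support means in practice: every architecture in the lists above loads through the same Auto classes regardless of backend. It assumes `transformers` with the PyTorch backend is installed; `roberta-base` is only an illustrative checkpoint.

```python
# Minimal sketch (not part of the diff): any architecture listed above is
# reachable through the Auto classes once the table confirms its backend.
from transformers import AutoTokenizer, AutoModel

checkpoint = "roberta-base"  # example checkpoint; any supported model id works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # tokenizer backed by 🤗 Tokenizers
model = AutoModel.from_pretrained(checkpoint)          # PyTorch implementation

inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```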
diff --git a/README_te.md b/README_te.md
index 6677b33b11a75b..19cbe320624186 100644
--- a/README_te.md
+++ b/README_te.md
@@ -291,278 +291,8 @@ Flax, PyTorch, or TensorFlow installation
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the blog post [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) released by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments on ESPnet Toolkit Boosted by Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the blog post [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/abs/2005.00341) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) released by the [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the paper [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano: Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong, etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/abs/2102.12122) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong, etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with a [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Ćukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from Meta Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr DollĂĄr.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault FĂ©vry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T – Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper StableLM 3B 4E1T (Technical Report) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of WĂŒrzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, PaweĆ Krzysztof Nowak, Thomas MĂŒller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr DollĂĄr, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario LuÄiÄ, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco GuzmĂĄn, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added **a detailed guide and templates** to guide you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
-
-To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the đ€ Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks) (a minimal programmatic check is sketched after this hunk).
+đ€ Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
+
 These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
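
As a quick illustration of the framework-support check that the removed paragraph points to, here is a minimal sketch (not part of either README); it assumes a Python environment with `transformers` and at least PyTorch installed, and uses `bert-base-uncased` purely as an example checkpoint:

```python
# Minimal sketch: probe which backends this transformers installation can
# use, then run a forward pass with the PyTorch classes.
# Assumptions: `transformers` plus at least PyTorch are installed;
# "bert-base-uncased" is just an example checkpoint.
from transformers import AutoModel, AutoTokenizer
from transformers.utils import is_flax_available, is_tf_available, is_torch_available

print("PyTorch available:", is_torch_available())
print("TensorFlow available:", is_tf_available())
print("Flax available:", is_flax_available())

if is_torch_available():
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    inputs = tokenizer("Hello world!", return_tensors="pt")
    outputs = model(**inputs)
    # Shape: (batch_size, sequence_length, hidden_size)
    print(outputs.last_hidden_state.shape)
```

Whether a given checkpoint actually ships weights for a backend is a per-model question, which is what the linked table answers; the probe above only tells you what your local environment can run.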
diff --git a/README_vi.md b/README_vi.md
index 6f77b43da9add0..4b48800ee349b4 100644
--- a/README_vi.md
+++ b/README_vi.md
@@ -290,276 +290,7 @@ Follow the installation pages of Flax, PyTorch or TensorFlow to see
Sá» lÆ°á»Łng Äiá»m kiá»m tra hiá»n táșĄi: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-đ€ Transformers hiá»n Äang cung cáș„p cĂĄc kiáșżn trĂșc sau ÄĂąy (xem [á» ÄĂąy](https://huggingface.co/docs/transformers/model_summary) Äá» cĂł má»t tĂłm táșŻt tá»ng quan vá» má»i kiáșżn trĂșc):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (từ Google Research vĂ Toyota Technological Institute táșĄi Chicago) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), của Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) của Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (từ BAAI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) của Chen, Zhongzhi vĂ Liu, Guang vĂ Zhang, Bo-Wen vĂ Ye, Fulong vĂ Yang, Qinghong vĂ Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (từ MIT) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) của Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (từ ÄáșĄi há»c Tsinghua) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) của Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (từ Suno) ÄÆ°á»Łc phĂĄt hĂ nh trong kho lÆ°u trữ [suno-ai/bark](https://github.com/suno-ai/bark) bá»i Äá»i ngĆ© Suno AI.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) của Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov vĂ Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (từ Ăcole polytechnique) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) của Moussa Kamal Eddine, Antoine J.-P. Tixier vĂ Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (từ VinAI Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) của Nguyen Luong Tran, Duong Minh Le vĂ Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (từ Microsoft) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) của Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (từ Google) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) của Jacob Devlin, Ming-Wei Chang, Kenton Lee vĂ Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (từ Google) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) của Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (từ VinAI Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) của Dat Quoc Nguyen, Thanh Vu vĂ Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) của Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang vĂ Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) của Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang vĂ Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (từ Microsoft Research AI4Science) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (từ Google AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) của Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) của Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) của Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (từ Salesforce) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) của Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (từ Salesforce) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (từ BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (từ Alexa) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (từ Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (từ NAVER CLOVA) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (từ Inria/Facebook/Sorbonne) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz SuĂĄrez*, Yoann Dupont, Laurent Romary, Ăric Villemonte de la Clergerie, DjamĂ© Seddah and BenoĂźt Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (từ OFA-Sys) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (từ LAION-AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (từ OpenAI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (từ University of Göttingen) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo LĂŒddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (từ Salesforce) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (từ MetaAI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste RoziĂšre, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, JĂ©rĂ©my Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre DĂ©fossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (từ Cohere) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (từ Microsoft Research Asia) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (từ YituTech) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (từ Tsinghua University) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (từ OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (từ Salesforce) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (từ Microsoft) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (từ Microsoft) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (từ Microsoft) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (từ Berkeley/Facebook/Google) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (từ SenseTime Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, HervĂ© JĂ©gou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (từ Google AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (từ University of Hong Kong and TikTok) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (từ The University of Texas at Austin) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp KrĂ€henbĂŒhl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (từ Microsoft Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (từ SHI Labs) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (từ Meta AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, TimothĂ©e Darcet, ThĂ©o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, HervĂ© Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (từ HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (từ Microsoft Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (từ NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (từ Facebook) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas OÄuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (từ Intel Labs) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by RenĂ© Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (từ Snap Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (từ Google Brain) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (từ Google Research/Stanford University) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (từ Meta AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre DĂ©fossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (từ Google Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (từ Baidu) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T - Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin-Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (từ Meta AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (từ Microsoft Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (từ OpenAI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (từ Microsoft Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (từ Meta AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (từ Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (từ Microsoft Research) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (từ Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmån, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (từ Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (từ Meta AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (từ Google/CMU) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (từ Facebook AI) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (từ Huazhong University of Science & Technology) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (từ the University of Wisconsin - Madison) ÄÆ°á»Łc phĂĄt hĂ nh vá»i bĂ i bĂĄo [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Muá»n ÄĂłng gĂłp má»t mĂŽ hĂŹnh má»i? ChĂșng tĂŽi ÄĂŁ thĂȘm má»t **hÆ°á»ng dáș«n chi tiáșżt vĂ máș«u** Äá» hÆ°á»ng dáș«n báșĄn trong quĂĄ trĂŹnh thĂȘm má»t mĂŽ hĂŹnh má»i. BáșĄn cĂł thá» tĂŹm tháș„y chĂșng trong thÆ° mỄc [`templates`](./templates) của kho lÆ°u trữ. HĂŁy cháșŻc cháșŻn kiá»m tra [hÆ°á»ng dáș«n ÄĂłng gĂłp](./CONTRIBUTING.md) vĂ liĂȘn há» vá»i ngÆ°á»i duy trĂŹ hoáș·c má» má»t váș„n Äá» Äá» thu tháșp pháșŁn há»i trÆ°á»c khi báșŻt Äáș§u PR của báșĄn.
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
diff --git a/README_zh-hans.md b/README_zh-hans.md
index a92169769a3741..b89edf31071eb1 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -240,276 +240,7 @@ conda install conda-forge::transformers
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT-2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet and Microsoft Research) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (æ„èȘ Studio Ousia) 䌎éèźșæ [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) ç± Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka ććžă
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (æ„èȘ Facebook) 䌎éèźșæ [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) ç± Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli ććžă
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (æ„èȘ CMU/Google Brain) 䌎éèźșæ [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) ç± Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou ććžă
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (æ„èȘ Google Inc.) 䌎éèźșæ [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) ç± Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam ććžă
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (æ„èȘ Google Inc.) 䌎éèźșæ [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) ç± Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen ććžă
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (æ„èȘ Apple) 䌎éèźșæ [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) ç± Sachin Mehta and Mohammad Rastegari ććžă
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (æ„èȘ Apple) 䌎éèźșæ [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) ç± Sachin Mehta and Mohammad Rastegari ććžă
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (æ„èȘ Microsoft Research) 䌎éèźșæ [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) ç± Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu ććžă
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (æ„èȘ MosaicML) 䌎éèźșæ [llm-foundry](https://github.com/mosaicml/llm-foundry/) ç± the MosaicML NLP Team ććžă
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (æ„èȘ the University of Wisconsin - Madison) 䌎éèźșæ [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) ç± Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh ććžă
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (æ„èȘ Google AI) 䌎éèźșæ [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) ç± Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel ććžă
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (æ„èȘ äžćœäșșæ°ć€§ćŠ AI Box) 䌎éèźșæ [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) ç± Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen ććžă
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (æ„èȘ SHI Labs) 䌎éèźșæ [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) ç± Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi ććžă
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (æ„èȘ ćäžșèŻșäșæčèćźéȘ柀) 䌎éèźșæ [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) ç± Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu ććžă
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (æ„èȘ Meta) 䌎éèźșæ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) ç± the NLLB team ććžă
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (æ„èȘ Meta) 䌎éèźșæ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) ç± the NLLB team ććžă
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (æ„èȘ Meta AI) 䌎éèźșæ [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) ç± Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic ććžă
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (æ„èȘ the University of Wisconsin - Madison) 䌎éèźșæ [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) ç± Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh ććžă
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (æ„èȘ AI2) 䌎éèźșæ [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) ç± Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi ććžă
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (æ„èȘ SHI Labs) 䌎éèźșæ [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) ç± Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi ććžă
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (æ„èȘ [s-JoL](https://huggingface.co/s-JoL)) ćš GitHub ććž (ç°ć·Čć é€)ă
-1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (æ„èȘ Meta AI) 䌎éèźșæ [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) ç± Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. ććžă
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (æ„èȘ Google AI) 䌎éèźșæ [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) ç± Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby ććžă
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (æ„èȘ Google AI) 䌎éèźșæ [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) ç± Matthias Minderer, Alexey Gritsenko, Neil Houlsby ććžă
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (æ„èȘ IBM Research) 䌎éèźșæ [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) ç± Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam ććžă
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (æ„èȘ IBM) 䌎éèźșæ [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) ç± Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam ććžă
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (æ„èȘ Google) 䌎éèźșæ [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) ç± Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu ććžă
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (æ„èȘ Google) 䌎éèźșæ [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) ç± Jason Phang, Yao Zhao, Peter J. Liu ććžă
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (æ„èȘ Deepmind) 䌎éèźșæ [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) ç± Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier HĂ©naff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, JoĂŁo Carreira ććžă
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (æ„èȘ ADEPT) 䌎éèźșæ [blog post](https://www.adept.ai/blog/persimmon-8b) ç± Erich Elsen, Augustus Odena, Maxwell Nye, SaÄnak TaĆırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani ććžă
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (æ„èȘ VinAI Research) 䌎éèźșæ [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) ç± Dat Quoc Nguyen and Anh Tuan Nguyen ććžă
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (æ„èȘ Google) 䌎éèźșæ [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) ç± Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova ććžă
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (æ„èȘ UCLA NLP) 䌎éèźșæ [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) ç± Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang ććžă
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (æ„èȘ Sea AI Labs) 䌎éèźșæ [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) ç± Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng ććžă
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (æ„èȘ Microsoft Research) 䌎éèźșæ [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) ç± Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou ććžă
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (æ„èȘ Nanjing University, The University of Hong Kong etc.) 䌎éèźșæ [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) ç± Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao ććžă
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (æ„èȘ Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) 䌎éèźșæ [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) ç± Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao ććžă
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (æ„èȘ NVIDIA) 䌎éèźșæ [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) ç± Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius ććžă
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (æ„èȘ the Qwen team, Alibaba Group) 䌎éèźșæ [Qwen Technical Report](https://arxiv.org/abs/2309.16609) ç± Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu ććžă
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (æ„èȘ the Qwen team, Alibaba Group) 䌎éèźșæ [blog post](https://qwenlm.github.io/blog/qwen-moe/) ç± Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou ććžă
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (æ„èȘ Facebook) 䌎éèźșæ [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) ç± Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich KĂŒttler, Mike Lewis, Wen-tau Yih, Tim RocktĂ€schel, Sebastian Riedel, Douwe Kiela ććžă
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (æ„èȘ Google Research) 䌎éèźșæ [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) ç± Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang ććžă
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (æ„èȘ Google) 䌎éèźșæ [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) ç± the Griffin, RLHF and Gemma Teams ććžă
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (æ„èȘ Google Research) 䌎éèźșæ [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) ç± Nikita Kitaev, Ćukasz Kaiser, Anselm Levskaya ććžă
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr DollĂĄr.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (æ„èȘ Google Research) 䌎éèźșæ [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) ç± Hyung Won Chung, Thibault FĂ©vry, Henry Tsai, M. Johnson, Sebastian Ruder ććžă
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (æ„èȘ Facebook), 䌎éèźșæ [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) ç± Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov ććžă
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (æ„èȘ Facebook) 䌎éèźșæ [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) ç± Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli ććžă
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (æ„èȘ WeChatAI), 䌎éèźșæ [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) ç± Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou ććžă
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (æ„èȘ ZhuiyiTechnology), 䌎éèźșæ [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) ç± Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu ććžă
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (æ„èȘ Bo Peng) 䌎éèźșæ [this repo](https://github.com/BlinkDL/RWKV-LM) ç± Bo Peng ććžă
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (æ„èȘ NVIDIA) 䌎éèźșæ [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) ç± Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo ććžă
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (æ„èȘ Beijing Academy of Artificial Intelligence (BAAI)) 䌎éèźșæ [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) ç± Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang ććžă
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (æ„èȘ Meta AI) 䌎éèźșæ [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) ç± Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick ććžă
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (æ„èȘ ASAPP) 䌎éèźșæ [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) ç± Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi ććžă
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (æ„èȘ ASAPP) 䌎éèźșæ [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) ç± Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi ććžă
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (æ„èȘ Google AI) 䌎éèźșæ [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) ç± Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer ććžă
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (æ„èȘ Microsoft Research) 䌎éèźșæ [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) ç± Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei ććžă
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (æ„èȘ Facebook), 䌎éèźșæ [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) ç± Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino ććžă
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (æ„èȘ Facebook) 䌎éèźșæ [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) ç± Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau ććžă
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (æ„èȘ Tel Aviv University) 䌎éèźșæ [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) ç± Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy ććžă
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (æ„èȘ Berkeley) 䌎éèźșæ [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) ç± Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer ććžă
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas KrauĂ, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (æ„èȘ MBZUAI) 䌎éèźșæ [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) ç± Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan ććžă
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (æ„èȘ Microsoft) 䌎éèźșæ [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) ç± Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo ććžă
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (æ„èȘ Microsoft) 䌎éèźșæ [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) ç± Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo ććžă
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (æ„èȘ University of WĂŒrzburg) 䌎éèźșæ [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) ç± Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte ććžă
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (æ„èȘ Google AI) 䌎éèźșæ [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) ç± Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu ććžă
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (æ„èȘ Google AI) 䌎éèźșæ [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) ç± Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu ććžă
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (æ„èȘ Microsoft Research) 䌎éèźșæ [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) ç± Brandon Smock, Rohith Pesala, Robin Abraham ććžă
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (æ„èȘ Google AI) 䌎éèźșæ [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) ç± Jonathan Herzig, PaweĆ Krzysztof Nowak, Thomas MĂŒller, Francesco Piccinno and Julian Martin Eisenschlos ććžă
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (æ„èȘ Microsoft Research) 䌎éèźșæ [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) ç± Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou ććžă
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (æ„èȘ Google/CMU) 䌎éèźșæ [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) ç± Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov ććžă
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (æ„èȘ Microsoft) 䌎éèźșæ [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) ç± Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei ććžă
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (æ„èȘ UNC Chapel Hill) 䌎éèźșæ [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) ç± Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal ććžă
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (æ„èȘ Intel) 䌎éèźșæ [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) ç± Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding ććžă
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (æ„èȘ Microsoft Research) 䌎éèźșæ [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) ç± Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal ććžă
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (æ„èȘ Google Research) 䌎éèźșæ [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) ç± Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant ććžă
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (æ„èȘ Microsoft Research) 䌎éèźșæ [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) ç± Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang ććžă
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (æ„èȘ Microsoft Research) 䌎éèźșæ [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) ç± Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu ććžă
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (æ„èȘ Peking University) 䌎éèźșæ [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) ç± Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun ććžă
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (æ„èȘ Tsinghua University and Nankai University) 䌎éèźșæ [Visual Attention Network](https://arxiv.org/abs/2202.09741) ç± Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu ććžă
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (æ„èȘ Multimedia Computing Group, Nanjing University) 䌎éèźșæ [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) ç± Zhan Tong, Yibing Song, Jue Wang, Limin Wang ććžă
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (æ„èȘ NAVER AI Lab/Kakao Enterprise/Kakao Brain) 䌎éèźșæ [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) ç± Wonjae Kim, Bokyung Son, Ildoo Kim ććžă
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (æ„èȘ University of WisconsinâMadison) 䌎éèźșæ [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) ç± Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee ććžă
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (æ„èȘ Google AI) 䌎éèźșæ [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) ç± Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby ććžă
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (æ„èȘ UCLA NLP) 䌎éèźșæ [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) ç± Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang ććžă
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (æ„èȘ Google AI) 䌎éèźșæ [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) ç± Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby ććžă
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (æ„èȘ Meta AI) 䌎éèźșæ [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) ç± Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He ććžă
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (æ„èȘ Meta AI) 䌎éèźșæ [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) ç± Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr DollĂĄr, Ross Girshick ććžă
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (æ„èȘ HUST-VL) 䌎éèźșæ [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) ç± Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang ććžă
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (æ„èȘ Meta AI) 䌎éèźșæ [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) ç± Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas ććžă
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (æ„èȘ Kakao Enterprise) 䌎éèźșæ [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) ç± Jaehyeon Kim, Jungil Kong, Juhee Son ććžă
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (æ„èȘ Google Research) 䌎éèźșæ [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) ç± Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario LuÄiÄ, Cordelia Schmid ććžă
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (æ„èȘ Facebook AI) 䌎éèźșæ [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) ç± Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli ććžă
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (æ„èȘ Facebook AI) 䌎éèźșæ [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) ç± Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino ććžă
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (æ„èȘ Facebook AI) 䌎éèźșæ [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) ç± Qiantong Xu, Alexei Baevski, Michael Auli ććžă
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (æ„èȘ OpenAI) 䌎éèźșæ [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) ç± Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever ććžă
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (æ„èȘ Microsoft Research) 䌎éèźșæ [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) ç± Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling ććžă
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (æ„èȘ Meta AI) 䌎éèźșæ [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) ç± Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe ććžă
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (æ„èȘ Facebook) 䌎éèźșæ [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) ç± Guillaume Lample and Alexis Conneau ććžă
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (æ„èȘ Microsoft Research) 䌎éèźșæ [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) ç± Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou ććžă
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (æ„èȘ Facebook AI), 䌎éèźșæ [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) ç± Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco GuzmĂĄn, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov ććžă
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (æ„èȘ Facebook AI) 䌎éèźșæ [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) ç± Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau ććžă
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (æ„èȘ Meta AI) 䌎éèźșæ [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) ç± Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa ććžă
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (æ„èȘ Google/CMU) 䌎éèźșæ [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) ç± Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le ććžă
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (æ„èȘ Facebook AI) 䌎éèźșæ [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) ç± Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli ććžă
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (æ„èȘ Facebook AI) 䌎éèźșæ [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) ç± Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli ććžă
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (æ„èȘ Huazhong University of Science & Technology) 䌎éèźșæ [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) ç± Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu ććžă
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (æ„èȘ the University of Wisconsin - Madison) 䌎éèźșæ [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) ç± Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh ććžă
-1. æłèŠèŽĄçźæ°çæšĄćïŒæ仏èżéæäžä»œ**èŻŠç»æćŒćæšĄæż**æ„ćŒćŻŒäœ æ·»ć æ°çæšĄćăäœ ćŻä»„ćš [`templates`](./templates) çźćœäžæŸć°ä»ä»Źăèź°ćŸæ„ç [èŽĄçźæć](./CONTRIBUTING.md) ćč¶ćšćŒć§ć PR ćè系绎æ€äșșćæćŒäžäžȘæ°ç issue æ„è·ćŸćéŠă
+đ€ Transformers çźćæŻæćŠäžçæ¶æ: æšĄćæŠèż°èŻ·é
[èżé](https://huggingface.co/docs/transformers/model_summary). èŠæŁæ„æäžȘæšĄćæŻćŠć·Čæ FlaxăPyTorch æ TensorFlow çćźç°ïŒæć
¶æŻćŠćš đ€ Tokenizers ćșäžæćŻčćșèŻçŹŠććšïŒtokenizerïŒïŒæŹèŻ·ćé
[æ€èĄš](https://huggingface.co/docs/transformers/index#supported-frameworks)ă
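The linked table is the canonical reference, but framework support can also be checked programmatically. Below is a minimal sketch, assuming a standard `transformers` install; `bert-base-uncased` is only an illustrative checkpoint:

```python
# Minimal sketch: report which backends this transformers install can use,
# then resolve one checkpoint to its architecture and tokenizer type.
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import is_flax_available, is_tf_available, is_torch_available

print("PyTorch available:", is_torch_available())
print("TensorFlow available:", is_tf_available())
print("Flax available:", is_flax_available())

# "bert-base-uncased" is only an illustrative checkpoint.
config = AutoConfig.from_pretrained("bert-base-uncased")
print("architecture:", config.model_type)  # e.g. "bert"

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("fast (đ€ Tokenizers-backed) tokenizer:", tokenizer.is_fast)
```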
diff --git a/README_zh-hant.md b/README_zh-hant.md
index d62727ffcb0034..ae7332eaa25525 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -252,276 +252,7 @@ conda install conda-forge::transformers
çźćçæȘąæ„é»æžéïŒ ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-đ€ Transformers çźćæŻæŽä»„äžçæ¶æ§ïŒæšĄćæŠèŠœè«ćé±[éèŁĄ](https://huggingface.co/docs/transformers/model_summary)ïŒïŒ
-
-1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
-1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
-1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
-1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from Ăcole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
-1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
-1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
-1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
-1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
-1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
-1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
-1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
-1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
-1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
-1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz SuĂĄrez*, Yoann Dupont, Laurent Romary, Ăric Villemonte de la Clergerie, DjamĂ© Seddah and BenoĂźt Sagot.
-1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
-1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
-1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo LĂŒddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
-1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
-1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste RoziÚre, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
-1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
-1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
-1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
-1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by [OpenBMB](https://www.openbmb.org/).
-1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
-1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-1. **[DBRX](https://huggingface.co/docs/transformers/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
-1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
-1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-1. **[Depth Anything](https://huggingface.co/docs/transformers/model_doc/depth_anything)** (from University of Hong Kong and TikTok) released with the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao.
-1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
-1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
-1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT.
-1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
-1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[DPT](https://huggingface.co/docs/transformers/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
-1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
-1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
-1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
-1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
-1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, Badreddine Noune, Baptiste Pannier, Guilherme Penedo.
-1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet and Microsoft Research) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.
-1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
-1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
-1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
-1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
-1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
-1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach.
-1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
-1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.
-1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
-1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
-1. **[Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.
-1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
-1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
-1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
-1. **[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2)** (from Hugging Face) released with the blog post [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurençon, Victor Sanh.
-1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
-1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
-1. **[Jamba](https://huggingface.co/docs/transformers/model_doc/jamba)** (from AI21 Labs Ltd.) released with the paper [Jamba: A Hybrid Transformer-Mamba Language Model](https://arxiv.org/abs/2403.19887) by Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, Omri Abend, Raz Alon, Tomer Asida, Amir Bergman, Roman Glozman, Michael Gokhman, Avshalom Manevich, Nir Ratner, Noam Rozen, Erez Shwartz, Mor Zusman, Yoav Shoham.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/abs/2005.00341) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
-1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
-1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
-1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
-1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
-1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
-1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, TimothĂ©e Lacroix, Baptiste RoziĂšre, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
-1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[LLaVA-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/abs/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
-1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
-1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
-1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
-1. **[Mamba](https://huggingface.co/docs/transformers/model_doc/mamba)** (from Albert Gu and Tri Dao) released with the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao.
-1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
-1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
-1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
-1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
-1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
-1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
-1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
-1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
-1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
-1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
-1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre DĂ©fossez.
-1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
-1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
-1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
-1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
-1. **[OLMo](https://huggingface.co/docs/transformers/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
-1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
-1. **[OPT](https://huggingface.co/docs/transformers/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
-1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
-1. **[PatchTSMixer](https://huggingface.co/docs/transformers/model_doc/patchtsmixer)** (from IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[PatchTST](https://huggingface.co/docs/transformers/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
-1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
-1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
-1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
-1. **[Phi](https://huggingface.co/docs/transformers/model_doc/phi)** (from Microsoft) released with the papers [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, and [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
-1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
-1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
-1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
-1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
-1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/abs/2102.12122) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
-1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
-1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.
-1. **[Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.
-1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
-1. **[RecurrentGemma](https://huggingface.co/docs/transformers/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams.
-1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
-1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
-1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
-1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su, Yu Lu, Shengfeng Pan, Bo Wen, Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in the repository [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
-1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
-1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[SegGPT](https://huggingface.co/docs/transformers/model_doc/seggpt)** (from Beijing Academy of Artificial Intelligence (BAAI)) released with the paper [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/abs/2304.02643) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-1. **[SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)** (from Google AI) released with the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer.
-1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
-1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
-1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
-1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
-1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
-1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
-1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
-1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
-1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
-1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
-1. **[UDOP](https://huggingface.co/docs/transformers/model_doc/udop)** (from Microsoft Research) released with the paper [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
-1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
-1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
-1. **[UnivNet](https://huggingface.co/docs/transformers/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
-1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
-1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
-1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
-1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
-1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
-1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr DollĂĄr, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
-1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
-1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
-1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
-1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
-1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
-1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
-1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
-1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
-1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
-1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco GuzmĂĄn, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
-1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
-1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
-1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. æłèŠèČąç»æ°çæšĄćïŒæćéèŁĄæäžä»œ**è©łçŽ°æćŒćæšĄæż**äŸćŒć°äœ ć ć
„æ°çæšĄćăäœ ćŻä»„ćš [`templates`](./templates) çźéäžæŸć°ćźćăèšćŸæ„ç[èČąç»æćŒ](./CONTRIBUTING.md)䞊ćšéć§ćŻ« PR ćèŻçč«ç¶è·äșșćĄæéäžćæ°ç issue äŸçČćŸ feedbacksă
+đ€ Transformers çźćæŻæŽä»„äžçæ¶æ§: æšĄćæŠèŠœè«ćé±[éèŁĄ](https://huggingface.co/docs/transformers/model_summary).
èŠæȘąæ„æćæšĄćæŻćŠć·Čæ FlaxăPyTorch æ TensorFlow ç毊äœïŒæć
¶æŻćŠćšđ€ Tokenizers ćœćŒćș«äžæć°æç tokenizerïŒæŹè«ćé±[æ€èĄš](https://huggingface.co/docs/transformers/index#supported-frameworks)ă
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index d2656274485640..b34ab7517821e5 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -48,6 +48,8 @@ RUN python3 -m pip install --no-cache-dir decord av==9.2.0
# For GGUF tests
RUN python3 -m pip install --no-cache-dir gguf
+# Some slow tests require bnb
+RUN python3 -m pip install --no-cache-dir bitsandbytes
# For `dinat` model
# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
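
As context for the new dependency: the bnb-gated slow tests load models quantized with bitsandbytes. Below is a minimal sketch of that kind of load; the checkpoint name is illustrative, not taken from the test suite, and running it requires a CUDA GPU with `bitsandbytes` and `accelerate` installed:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# quantizes the linear layers to 8-bit while loading the checkpoint
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
```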
diff --git a/docs/source/de/_toctree.yml b/docs/source/de/_toctree.yml
index 068beccdfe8578..859c4b7b3b3010 100644
--- a/docs/source/de/_toctree.yml
+++ b/docs/source/de/_toctree.yml
@@ -33,8 +33,6 @@
title: How to contribute to 🤗 Transformers?
- local: add_new_model
title: How to add a model to 🤗 Transformers?
- - local: add_tensorflow_model
- title: How to convert a 🤗 Transformers model to TensorFlow?
- local: add_new_pipeline
title: How to add a pipeline to 🤗 Transformers?
- local: testing
diff --git a/docs/source/de/add_new_model.md b/docs/source/de/add_new_model.md
index 3f3317dd8b7e96..3c8987f44254bc 100644
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@@ -17,12 +17,6 @@ rendered properly in your Markdown viewer.
The 🤗 Transformers library is often able to offer new models thanks to community contributions. But this can be a challenging project and requires in-depth knowledge of the 🤗 Transformers library and of the model to implement. At Hugging Face, we are trying to empower more of the community to actively add models, and we have put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)).
-
-
-If you are interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide!
-
-
-
Along the way, you'll:
- get insights into open-source best practices
@@ -404,12 +398,14 @@ In the special case that you are adding a model whose architecture exactly matches the architecture of an
existing model, you only have to add a conversion script as described in [this section](#write-a-conversion-script).
In this case, you can simply reuse the whole model architecture of the already existing model.
-Otherwise, let's start generating a new model. You have two choices here:
+Otherwise, let's start generating a new model. We recommend using the following script to add a model starting from
+an existing model:
-- `transformers-cli add-new-model-like` to add a new model like an existing one
-- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart, depending on the type of model you select)
+```bash
+transformers-cli add-new-model-like
+```
-In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires `cookiecutter` to be installed; you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
+You will be prompted with a questionnaire to fill in the basic information of your model.
**Open a Pull Request on the main huggingface/transformers repo**
diff --git a/docs/source/de/add_tensorflow_model.md b/docs/source/de/add_tensorflow_model.md
deleted file mode 100644
index 8488acbe709b64..00000000000000
--- a/docs/source/de/add_tensorflow_model.md
+++ /dev/null
@@ -1,356 +0,0 @@
-
-
-# How to convert a 🤗 Transformers model to TensorFlow?
-
-Having multiple frameworks available to use with 🤗 Transformers gives you the flexibility to play to their strengths when
-designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that
-adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)!
-Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or
-enable TensorFlow for your model of choice, this guide is for you.
-
-This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or
-architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model
-is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶.
-Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we
-highly encourage you to suggest improvements to this guide!
-
-Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers:
-- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers)
-- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy)
-
-In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the
-procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML
-frameworks. Let's get started!
-
-
-
-Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture?
-
-
-
-Check the `model_type` field of the `config.json` of your model of choice
-([example](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in
-🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow
-architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)).
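-
-If you prefer to run this check from Python, the `AutoConfig` API exposes the same field; the checkpoint name below is
-just an example:
-
-```python
-from transformers import AutoConfig
-
-# downloads config.json from the Hub and parses its `model_type` field
-config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
-print(config.model_type)  # "bert"
-```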
-
-
-
-
-## Step-by-step guide to add TensorFlow model architecture code
-
-There are many ways to design a large model architecture, and multiple ways of implementing said design. However,
-you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers)
-that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. From
-experience, we can tell you a few important things about adding TensorFlow models:
-
-- Don't reinvent the wheel! More often than not, there are at least two reference implementations you should check: the
-PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems.
-- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather
-because the code is clear, easy to debug and build upon. If you make the life of the maintainers easy with your
-TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch
-to the PyTorch implementation, you ensure your contribution will be long-lived.
-- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the
-same problems you're facing.
-
-Here's an overview of the steps needed to add a TensorFlow model architecture:
-1. Select the model you wish to convert
-2. Prepare the transformers dev environment
-3. (Optional) Understand theoretical aspects and the existing implementation
-4. Implement the model architecture
-5. Implement model tests
-6. Submit the pull request
-7. (Optional) Build demos and share with the world
-
-### 1.-3. Prepare your model contribution
-
-**1. Select the model you wish to convert**
-
-Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you
-don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way
-to maximize your impact - we will guide you towards the most prominent architectures that are missing on the
-TensorFlow side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture
-implementation in 🤗 Transformers but is lacking weights, feel free to jump straight into the
-[weight conversion section](#adding-tensorflow-weights-to--hub)
-of this page.
-
-For simplicity, the remainder of this guide assumes you've decided to contribute the TensorFlow version of
-*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch).
-
-
-
-Before starting the work on a TensorFlow model architecture, double-check that there are no ongoing efforts in that
-direction. You can search for `BrandNewBert` on the
-[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no
-TensorFlow-related pull request.
-
-
-
-
-**2. Prepare transformers dev environment**
-
-Having selected the model architecture, open a draft PR to signal your intention to work on it. Follow the
-instructions below to set up your environment and open a draft PR.
-
-1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the
-   repository's page. This creates a copy of the code under your GitHub user account.
-
-2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:
-
-```bash
-git clone https://github.com/[your Github handle]/transformers.git
-cd transformers
-git remote add upstream https://github.com/huggingface/transformers.git
-```
-
-3. Set up a development environment, for instance by running the following command:
-
-```bash
-python -m venv .env
-source .env/bin/activate
-pip install -e ".[dev]"
-```
-
-Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case, make sure you install TensorFlow then do:
-
-```bash
-pip install -e ".[quality]"
-```
-
-**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.
-
-4. Create a branch with a descriptive name from your main branch:
-
-```bash
-git checkout -b add_tf_brand_new_bert
-```
-
-5. Fetch and rebase to current main:
-
-```bash
-git fetch upstream
-git rebase upstream/main
-```
-
-6. Add an empty `.py` file in `transformers/src/models/brandnewbert/` with the name `modeling_tf_brandnewbert.py`.
-This will be your TensorFlow model file.
-
-7. Push the changes to your account using:
-
-```bash
-git add .
-git commit -m "initial commit"
-git push -u origin add_tf_brand_new_bert
-```
-
-8. Once you're happy, go to the webpage of your fork on GitHub. Click on "Pull request". Make sure to add the
-   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified of
-   future changes.
-
-9. Change the PR into a draft by clicking on "Convert to draft" on the right of the GitHub pull request web page.
-
-
-You now have a development environment set up to port *BrandNewBert* to TensorFlow in 🤗 Transformers.
-
-
-**3. (Optional) Understand theoretical aspects and the existing implementation**
-
-You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
-sections of the paper that are difficult to understand. If that's the case, that's fine - don't worry! The goal is
-not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
-effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too
-much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation
-page (e.g. [model docs for BERT](model_doc/bert)).
-
-After you've grasped the basics of the model you are about to implement, it's important to understand the existing
-implementation. This is a great opportunity to confirm that a working implementation matches your expectations for
-the model, as well as to foresee technical challenges on the TensorFlow side.
-
-It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is
-definitely not a requirement that you understand all facets of the model at this stage. Nonetheless, we highly
-encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/).
-
-
-### 4. Model implementation
-
-It's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of
-`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into
-`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of
-🤗 Transformers such that you can import `TFBrandNewBert` and
-`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model.
-
-Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of
-tips to make the process as smooth as possible:
-- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`).
-- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to
-  `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. If you're not sure
-  about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf)
-  or the [PyTorch documentation](https://pytorch.org/docs/stable/).
-- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct
-  replacement, the odds are that someone else already had the same problem.
-- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, to track
-  issues, and to add fixes down the line.
-- Some layers have different default values in each framework. A notable example is the batch normalization layer's
-  epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d)
-  and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)).
-  Double-check the documentation!
-- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()` (a short sketch
-  follows this list). See the following example:
-  [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) /
-  [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220)
-- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also
-  borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture.
-- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight
-  cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not
-  properly set, you will see it in the error message when loading the model weights.
-- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras
-  layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)).
-  `TFBrandNewBertModel` is simply a wrapper around this layer.
-- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel`
-  will need to hold an example of inputs to the model, the `dummy_inputs`
-  ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)).
-- If you get stuck, ask for help - we're here to help you! 🤗
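-
-To make a few of these tips concrete, here is a minimal sketch of a toy PyTorch block next to its TensorFlow mirror.
-The `Pooler` block below is a made-up example rather than code from the library; it only illustrates the naming,
-`build()`, and cross-loading conventions discussed above:
-
-```python
-import tensorflow as tf
-import torch
-from torch import nn
-
-
-# PyTorch: a toy block with a Linear layer and a learnable bias parameter.
-class BrandNewBertPooler(nn.Module):
-    def __init__(self, hidden_size):
-        super().__init__()
-        self.dense = nn.Linear(hidden_size, hidden_size)
-        self.bias = nn.Parameter(torch.zeros(hidden_size))
-
-    def forward(self, hidden_states):
-        return torch.tanh(self.dense(hidden_states)) + self.bias
-
-
-# TensorFlow mirror: same structure and variable names, `name` set on the
-# sublayer and weight for `from_pt=True` cross-loading, and the
-# `nn.Parameter` equivalent created inside `build()`.
-class TFBrandNewBertPooler(tf.keras.layers.Layer):
-    def __init__(self, hidden_size, **kwargs):
-        super().__init__(**kwargs)
-        self.hidden_size = hidden_size
-        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(name="bias", shape=(self.hidden_size,), initializer="zeros")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        return tf.tanh(self.dense(hidden_states)) + self.bias
-```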
-
-In addition to the model file itself, you will also need to add the pointers to the model classes and related
-documentation pages. You can complete this part entirely following the patterns in other PRs
-([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual
-changes:
-- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py`
-- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
-- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py`
-- Update the import structures of the public classes in `src/transformers/models/brand_new_bert/__init__.py`
-- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md`
-- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md`
-- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/de/index.md`
-
-When you're happy with your implementation, run the following checklist to confirm that your model architecture is
-ready:
-1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is
-propagated all the way from the top-level classes
-2. You have used `#copied from ...` whenever possible
-3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs`
-4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable`
-5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` (see the sketch below)
-6. You can call the TensorFlow model using the expected input format
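-
-For the cross-loading check in item 5, this is all it takes in practice (shown here with BERT as a stand-in, since
-*BrandNewBert* is a placeholder name):
-
-```python
-from transformers import TFBertModel
-
-# builds the Keras model and loads the PyTorch checkpoint into it
-tf_model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", from_pt=True)
-```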
-
-
-### 5. Add model tests
-
-Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as
-expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in
-`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary
-TensorFlow replacements. For now, make sure that in all calls to `.from_pretrained()` you use the `from_pt=True` flag
-to load the existing PyTorch weights.
-
-Once you're done, it's time for the moment of truth: run the tests! 😬
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is
-notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest
-problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide.
-In other cases, a general test might not be directly applicable to your model, in which case we suggest an override
-at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if
-you're stuck.
-
-When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉
-
-### 6.-7. Ensure everyone can use your model
-
-**6. Submit the pull request**
-
-Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code,
-run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause
-our automatic checks to fail.
-
-It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for
-review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need
-at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model.
-
-After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag
-in `.from_pretrained()` calls. Since there are no TensorFlow weights yet, you will have to add them! Check the
-section below for instructions on how to do so.
-
-Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are
-green, double-check the tests locally one last time
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-and we will merge your PR! Congratulations on the milestone 🎉
-
-**7. (Optional) Build demos and share with the world**
-
-One of the hardest parts about open-source is discoverability. How can the other users learn about the existence of
-your fabulous TensorFlow contribution? With proper communication, of course! 📣
-
-There are two main ways to share your model with the community:
-- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly
-  encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community).
-- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share your
-  achievement with the community - your model can now be used by thousands of engineers and researchers around the
-  world 🌍! We will be happy to retweet your posts and help you share your work with the community.
-
-
-## Adding TensorFlow weights to 🤗 Hub
-
-Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into
-TensorFlow weights is a breeze!
-
-Here's how to do it:
-1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command
-   `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens))
-2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository
-   containing the PyTorch weights you want to convert
-3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created
-
-That's it! 🎉
-
-
-## Debugging mismatches across ML frameworks 🐛
-
-At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you
-might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open
-the model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔
-
-First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗
-Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch
-between the two frameworks, it implies that the model is not following the reference implementation for at least one
-of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is
-arguably worse than a model that doesn't run at all! To that end, we aim at having a framework mismatch smaller than
-`1e-5` at all stages of the model.
-
-Like in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret
-ingredient here is patience. Here is our suggested workflow for when you come across this type of issue:
-1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a
-   certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the
-   numerical variables in a top-down fashion until you find the source of the problems (an end-to-end comparison
-   sketch follows at the end of this section).
-2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible
-   that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages
-   like StackOverflow and GitHub issues.
-3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the
-   issue, so you can focus on the problematic instruction, abstracting away the rest of the model! The bad news is
-   that you'll have to venture into the source implementation of said instruction. In some cases, you might find an
-   issue with a reference implementation - don't abstain from opening an issue in the upstream repository.
-
-In some cases, after discussion with the 🤗 Transformers team, we might conclude that fixing the mismatch is
-infeasible. When the mismatch is very small in the output layers of the model (but potentially large in the hidden
-states), we might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a
-`--max-error` flag to override the error message during the weight conversion process.
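-
-As a companion to step 1 above, here is a minimal sketch of the kind of end-to-end comparison used to measure the
-framework mismatch. BERT stands in for the model under test, and the input is a toy pre-tokenized sequence:
-
-```python
-import numpy as np
-import tensorflow as tf
-import torch
-
-from transformers import BertModel, TFBertModel
-
-pt_model = BertModel.from_pretrained("google-bert/bert-base-uncased")
-tf_model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", from_pt=True)
-
-input_ids = [[101, 7592, 102]]
-with torch.no_grad():
-    pt_out = pt_model(torch.tensor(input_ids)).last_hidden_state.numpy()
-tf_out = tf_model(tf.constant(input_ids)).last_hidden_state.numpy()
-
-print("max absolute mismatch:", np.max(np.abs(pt_out - tf_out)))  # target: < 1e-5
-```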
diff --git a/docs/source/de/contributing.md b/docs/source/de/contributing.md
index 4abc301766ee72..4c0e131a352242 100644
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@@ -98,7 +98,7 @@ New models are constantly released. If you want to implement a new model, please
Let us know if you are ready to contribute the model yourself. Then we can help you add it to 🤗 Transformers!
-We have added a [detailed guide and templates](https://github.com/huggingface/transformers/tree/main/templates) to help you add a new model, and we also have a more technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
+We also have a more technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
## Do you want to expand the documentation?
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 89bf51b2d24043..877ea25938b927 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -189,8 +189,6 @@
title: How to contribute to 🤗 Transformers?
- local: add_new_model
title: How to add a model to 🤗 Transformers?
- - local: add_tensorflow_model
- title: How to convert a 🤗 Transformers model to TensorFlow?
- local: add_new_pipeline
title: How to add a pipeline to 🤗 Transformers?
- local: testing
@@ -398,6 +396,8 @@
title: LLaMA
- local: model_doc/llama2
title: Llama2
+ - local: model_doc/llama3
+ title: Llama3
- local: model_doc/longformer
title: Longformer
- local: model_doc/longt5
@@ -462,6 +462,8 @@
title: Persimmon
- local: model_doc/phi
title: Phi
+ - local: model_doc/phi3
+ title: Phi-3
- local: model_doc/phobert
title: PhoBERT
- local: model_doc/plbart
diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md
index efbe4a82759a06..a0a16a14056d14 100644
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@@ -17,12 +17,6 @@ rendered properly in your Markdown viewer.
The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)).
-
-
-If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide!
-
-
-
Along the way, you'll:
- get insights into open-source best practices
@@ -404,12 +398,14 @@ In the special case that you are adding a model whose architecture exactly match
existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script).
In this case, you can just re-use the whole model architecture of the already existing model.
-Otherwise, let's start generating a new model. You have two choices here:
+Otherwise, let's start generating a new model. We recommend using the following script to add a model starting from
+an existing model:
-- `transformers-cli add-new-model-like` to add a new model like an existing one
-- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select)
+```bash
+transformers-cli add-new-model-like
+```
-In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires `cookiecutter` to be installed; you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
+You will be prompted with a questionnaire to fill in the basic information of your model.
**Open a Pull Request on the main huggingface/transformers repo**
diff --git a/docs/source/en/add_tensorflow_model.md b/docs/source/en/add_tensorflow_model.md
deleted file mode 100644
index 23a1e2d17082bb..00000000000000
--- a/docs/source/en/add_tensorflow_model.md
+++ /dev/null
@@ -1,356 +0,0 @@
-
-
-# How to convert a 🤗 Transformers model to TensorFlow?
-
-Having multiple frameworks available to use with 🤗 Transformers gives you the flexibility to play to their strengths when
-designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that
-adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)!
-Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or
-enable TensorFlow for your model of choice, this guide is for you.
-
-This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or
-architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model
-is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶.
-Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we
-highly encourage that you suggest improvements to this guide!
-
-Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers:
-- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers)
-- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy)
-
-In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the
-procedure to convert PyTorch model weights into TensorFlow model weights, and how to efficiently debug mismatches across ML
-frameworks. Let's get started!
-
-
-
-Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture?
-
-
-
-Check the `model_type` field of the `config.json` of your model of choice
-([example](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in
-🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow
-architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)).
-
-
-
-
-## Step-by-step guide to add TensorFlow model architecture code
-
-There are many ways to design a large model architecture, and multiple ways of implementing said design. However,
-you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers)
-that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. From
-experience, we can tell you a few important things about adding TensorFlow models:
-
-- Don't reinvent the wheel! More often than not, there are at least two reference implementations you should check: the
-PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems.
-- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather
-because the code is clear, easy to debug and build upon. If you make the life of the maintainers easy with your
-TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch
-to the PyTorch implementation, you ensure your contribution will be long lived.
-- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the same
-problems you're facing.
-
-Here's an overview of the steps needed to add a TensorFlow model architecture:
-1. Select the model you wish to convert
-2. Prepare transformers dev environment
-3. (Optional) Understand theoretical aspects and the existing implementation
-4. Implement the model architecture
-5. Implement model tests
-6. Submit the pull request
-7. (Optional) Build demos and share with the world
-
-### 1.-3. Prepare your model contribution
-
-**1. Select the model you wish to convert**
-
-Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you
-don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to
-maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow
-side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in
-🤗 Transformers but is lacking weights, feel free to jump straight into the
-[weight conversion section](#adding-tensorflow-weights-to--hub)
-of this page.
-
-For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of
-*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch).
-
-
-
-Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so.
-You can search for `BrandNewBert` on the
-[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no
-TensorFlow-related pull request.
-
-
-
-
-**2. Prepare transformers dev environment**
-
-Having selected the model architecture, open a draft PR to signal your intention to work on it. Follow the
-instructions below to set up your environment and open a draft PR.
-
-1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the
- repository's page. This creates a copy of the code under your GitHub user account.
-
-2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:
-
- ```bash
- git clone https://github.com/[your Github handle]/transformers.git
- cd transformers
- git remote add upstream https://github.com/huggingface/transformers.git
- ```
-
-3. Set up a development environment, for instance by running the following commands:
-
- ```bash
- python -m venv .env
- source .env/bin/activate
- pip install -e ".[dev]"
- ```
-
-   Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-   failure with this command. If that's the case, make sure to install TensorFlow, then do:
-
- ```bash
- pip install -e ".[quality]"
- ```
-
- **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.
-
-4. Create a branch with a descriptive name from your main branch:
-
- ```bash
- git checkout -b add_tf_brand_new_bert
- ```
-
-5. Fetch and rebase to current main:
-
- ```bash
- git fetch upstream
- git rebase upstream/main
- ```
-
-6. Add an empty `.py` file in `src/transformers/models/brand_new_bert/` named `modeling_tf_brand_new_bert.py`. This will
-be your TensorFlow model file.
-
-7. Push the changes to your account using:
-
- ```bash
- git add .
- git commit -m "initial commit"
- git push -u origin add_tf_brand_new_bert
- ```
-
-8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
-   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified of
- future changes.
-
-9. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
-
-
-Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers.
-
-
-**3. (Optional) Understand theoretical aspects and the existing implementation**
-
-You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
-sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is
-not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
-effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too
-much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation
-page (e.g. [model docs for BERT](model_doc/bert)).
-
-After you've grasped the basics of the models you are about to implement, it's important to understand the existing
-implementation. This is a great chance to confirm that a working implementation matches your expectations for the
-model, as well as to foresee technical challenges on the TensorFlow side.
-
-It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is
-definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly
-encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/).
-
-
-### 4. Model implementation
-
-Now it's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of
-`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into
-`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of
-🤗 Transformers such that `TFBrandNewBert` can be imported and
-`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model.
-
-Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of
-tips to make the process as smooth as possible:
-- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`).
-- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to
- `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. If you're not sure
- about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf)
- or the [PyTorch documentation](https://pytorch.org/docs/stable/).
-- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct
- replacement, the odds are that someone else already had the same problem.
-- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track
- issues, and add fixes down the line.
-- Some layers have different default values in each framework. A notable example is the batch normalization layer's
- epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d)
- and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)).
- Double-check the documentation!
-- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following
- example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) /
- [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220)
-- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also
- borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture.
-- Assigning the `name` attribute correctly in TensorFlow functions is critical for the `from_pt=True` weight
- cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not
- properly set, you will see it in the error message when loading the model weights.
-- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras
- layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)).
- `TFBrandNewBertModel` will simply be a wrapper around this layer.
-- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel`
- will need to hold an example of inputs to the model, the `dummy_inputs`
- ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)).
-- If you get stuck, ask for help - we're here to help you! 🤗
-
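-To make the first few tips concrete, here is a minimal sketch of a hypothetical *BrandNewBert* sub-layer (assuming a config with `hidden_size`, `layer_norm_eps` and `hidden_dropout_prob` attributes - not taken from any real port):
-
-```python
-import tensorflow as tf
-
-class TFBrandNewBertSelfOutput(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        # torch.nn.Linear -> tf.keras.layers.Dense; `name` mirrors the PyTorch
-        # attribute name so that `from_pt=True` cross-loading can match the weight.
-        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
-        # Mind the differing defaults across frameworks: pass epsilon explicitly.
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, hidden_states, input_tensor, training=False):
-        # `training` is propagated from the top-level class, since layers like
-        # Dropout behave differently at train time.
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        return self.LayerNorm(hidden_states + input_tensor)
-```
-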
-In addition to the model file itself, you will also need to add the pointers to the model classes and related
-documentation pages. You can complete this part entirely following the patterns in other PRs
-([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual
-changes:
-- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py`
-- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
-- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py`
-- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py`
-- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md`
-- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md`
-- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.md`
-
-When you're happy with your implementation, run the following checklist to confirm that your model architecture is
-ready:
-1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is
-propagated all the way from the top-level classes
-2. You have used `#copied from ...` whenever possible
-3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs`
-4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable`
-5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`
-6. You can call the TensorFlow model using the expected input format
-
-
-### 5. Add model tests
-
-Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as
-expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in
-`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary
-TensorFlow replacements. For now, in all `.from_pretrained()` calls, you should use the `from_pt=True` flag to load
-the existing PyTorch weights.
-
-After you're done, it's time for the moment of truth: run the tests! 🔬
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is
-notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest
-problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide.
-In other cases, a general test might not be directly applicable to your model, in which case we suggest an override
-at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if
-you're stuck.
-
-When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉
-
-### 6.-7. Ensure everyone can use your model
-
-**6. Submit the pull request**
-
-Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code,
-run our code formatting utility, `make fixup` 💪. This will automatically fix any formatting issues that would cause
-our automatic checks to fail.
-
-It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for
-review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need
-at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model.
-
-After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag in
-`.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section
-below for instructions on how to do it.
-
-Finally, once the TensorFlow weights are merged, you have at least 3 reviewer approvals, and all CI checks are
-green, double-check the tests locally one last time
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-and we will merge your PR! Congratulations on the milestone 🎉
-
-**7. (Optional) Build demos and share with the world**
-
-One of the hardest parts about open-source is discovery. How can other users learn about the existence of your
-fabulous TensorFlow contribution? With proper communication, of course! 📣
-
-There are two main ways to share your model with the community:
-- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly
- encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community).
-- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share
- your achievement with the community - your model can now be used by thousands of engineers and researchers around
- the world 🌍! We will be happy to retweet your posts and help you share your work with the community.
-
-
-## Adding TensorFlow weights to 🤗 Hub
-
-Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into
-TensorFlow weights is a breeze!
-
-Here's how to do it:
-1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command
- `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens))
-2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository
- containing the PyTorch weights you want to convert
-3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created
-
-That's it! 🎉
-
-
-## Debugging mismatches across ML frameworks 🐛
-
-At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you
-might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open the
-model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔
-
-First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗
-Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch
-between the two frameworks, it implies that the model is not following the reference implementation for at least one
-of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is
-arguably worse than a model that fails to run at all! To that end, we aim at having a framework mismatch smaller than
-`1e-5` at all stages of the model.
-
-As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret
-ingredient here is patience. Here is our suggested workflow for when you come across this type of issue:
-1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a
-   certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the
-   numerical variables in a top-down fashion until you find the source of the problems (see the sketch after this list).
-2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible
- that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages
- like StackOverflow and GitHub issues.
-3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the
- issue, so you can focus on the problematic instruction, abstracting away the rest of the model! The bad news is
- that you'll have to venture into the source implementation of said instruction. In some cases, you might find an
- issue with a reference implementation - don't abstain from opening an issue in the upstream repository.
-
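-As an illustration of step 1, here is a sketch of the kind of comparison to run at each `breakpoint()` (the tensor names are placeholders for whatever intermediate variables you capture):
-
-```python
-import numpy as np
-
-# Hypothetical tensors captured at the same point of the forward pass
-# in each framework.
-pt_out = pt_hidden_states.detach().cpu().numpy()
-tf_out = tf_hidden_states.numpy()
-
-# The largest absolute difference should stay below the 1e-5 target.
-print(np.abs(pt_out - tf_out).max())
-```
-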
-In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible.
-When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we
-might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error`
-flag to override the error message at weight conversion time.
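-For instance (a sketch reusing the placeholder repository name from above; the threshold value here is purely illustrative):
-
-```bash
-transformers-cli pt-to-tf --model-name foo/bar --max-error 5e-5
-```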
diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md
index eacd6e1c1071c8..868021a9cd2e27 100644
--- a/docs/source/en/deepspeed.md
+++ b/docs/source/en/deepspeed.md
@@ -659,7 +659,7 @@ You could also use the [`Trainer`]'s `--save_on_each_node` argument to automatic
For [torchrun](https://pytorch.org/docs/stable/elastic/run.html), you have to ssh to each node and run the following command on both of them. The launcher waits until both nodes are synchronized before launching the training.
```bash
-python -m torch.run --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \
+torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \
--master_port=9901 your_program.py --deepspeed ds_config.json
```
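
On the second node, the same command is assumed to run with only the rank changed:

```bash
torchrun --nproc_per_node=8 --nnode=2 --node_rank=1 --master_addr=hostname1 \
--master_port=9901 your_program.py --deepspeed ds_config.json
```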
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 912bbad1d2d5ea..419d3d5b1dc2cc 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -177,6 +177,7 @@ Flax), PyTorch, and/or TensorFlow.
| [LiLT](model_doc/lilt) | ✅ | ❌ | ❌ |
| [LLaMA](model_doc/llama) | ✅ | ❌ | ✅ |
| [Llama2](model_doc/llama2) | ✅ | ❌ | ✅ |
+| [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ |
| [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ |
| [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ |
| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ |
@@ -235,6 +236,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ |
| [Persimmon](model_doc/persimmon) | ✅ | ❌ | ❌ |
| [Phi](model_doc/phi) | ✅ | ❌ | ❌ |
+| [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ |
| [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ |
| [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ |
| [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/llama3.md b/docs/source/en/model_doc/llama3.md
new file mode 100644
index 00000000000000..1a7546c7e68a4f
--- /dev/null
+++ b/docs/source/en/model_doc/llama3.md
@@ -0,0 +1,85 @@
+
+
+# Llama3
+
+
+## Overview
+
+The Llama3 model was proposed in [Introducing Meta Llama 3: The most capable openly available LLM to date](https://ai.meta.com/blog/meta-llama-3/) by the Meta AI team.
+
+The abstract from the blogpost is the following:
+
+*Today, we're excited to share the first two models of the next generation of Llama, Meta Llama 3, available for broad use. This release features pretrained and instruction-fine-tuned language models with 8B and 70B parameters that can support a broad range of use cases. This next generation of Llama demonstrates state-of-the-art performance on a wide range of industry benchmarks and offers new capabilities, including improved reasoning. We believe these are the best open source models of their class, period. In support of our longstanding open approach, we're putting Llama 3 in the hands of the community. We want to kickstart the next wave of innovation in AI across the stack - from applications to developer tools to evals to inference optimizations and more. We can't wait to see what you build and look forward to your feedback.*
+
+Check out all Llama3 model checkpoints [here](https://huggingface.co/models?search=llama3).
+The original code from the authors can be found [here](https://github.com/meta-llama/llama3).
+
+## Usage tips
+
+
+
+The `Llama3` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the Hub use `torch_dtype = 'float16'`, which will be
+used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
+
+The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model with `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online), then cast to the default `dtype` of `torch` (which becomes `torch.float32`), and finally, if a `torch_dtype` is provided in the config, that will be used.
+
+Training the model in `float16` is not recommended and is known to produce `nan`; as such, the model should be trained in `bfloat16`.
+
+
+
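+As a minimal sketch of the dtype behaviour described above (the checkpoint name is taken from the quick-usage example further down):
+
+```python
+import torch
+from transformers import AutoModelForCausalLM
+
+# With torch_dtype="auto", the weights stay in the dtype stored in the
+# checkpoint's config (float16, per the note above) instead of being
+# upcast to torch.float32.
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", torch_dtype="auto")
+print(model.dtype)
+```
+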
+Tips:
+
+- Weights for the Llama3 models can be obtained by filling out [this form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
+- The architecture is exactly the same as Llama2.
+- The tokenizer is a BPE model based on [tiktoken](https://github.com/openai/tiktoken) (vs. the sentencepiece-based implementation used for Llama2). The main difference is that it ignores BPE merge rules when an input token is part of the vocab: if `"hugging"` is part of the vocab, it is automatically returned as a single token even when no merge rule exists to produce it, instead of being split into smaller units like `["hug", "ging"]`.
+- The original model uses `pad_id = -1`, which means that there is no padding token. We can't use the same logic: make sure to add a padding token using `tokenizer.add_special_tokens({"pad_token": "<pad>"})` and resize the token embedding accordingly. You should also set `model.config.pad_token_id` (see the sketch after these tips). The `embed_tokens` layer of the model is initialized with `self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx)`, which makes sure that encoding the padding token will output zeros, so passing it when initializing is recommended.
+- The original checkpoint can be converted using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command:
+
+```bash
+python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path --llama_version 3
+```
+
+- After conversion, the model and tokenizer can be loaded via:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("/output/path")
+model = AutoModelForCausalLM.from_pretrained("/output/path")
+```
+
+Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even though the biggest versions
+come in several checkpoints, each of them contains a part of each weight of the model, so we need to load them all in RAM). For the 70B model, 145GB of RAM is thus needed.
+
+
+- When using Flash Attention 2 via `attn_implementation="flash_attention_2"`, don't pass `torch_dtype` to the `from_pretrained` class method and use Automatic Mixed-Precision training. When using `Trainer`, simply set either `fp16` or `bf16` to `True`. Otherwise, make sure you are using `torch.autocast`. This is required because Flash Attention only supports the `fp16` and `bf16` data types.
+
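+Putting the padding-token tip above into code (a sketch, assuming the `/output/path` produced by the conversion script):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("/output/path")
+model = AutoModelForCausalLM.from_pretrained("/output/path")
+
+# Register a padding token (the original checkpoints ship without one),
+# resize the embedding matrix, and record the id in the config.
+tokenizer.add_special_tokens({"pad_token": "<pad>"})
+model.resize_token_embeddings(len(tokenizer))
+model.config.pad_token_id = tokenizer.pad_token_id
+```
+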
+## Quick usage
+
+```python
+import transformers
+import torch
+
+model_id = "meta-llama/Meta-Llama-3-8B"
+
+pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
+pipeline("Hey how are you doing today?")
+```
+
+## Resources
+A ton of cool resources are already available on the documentation page of [Llama2](./llama2), and contributors are invited to add new resources curated for Llama3 here! 🤗
diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md
new file mode 100644
index 00000000000000..4f6d7acad178e5
--- /dev/null
+++ b/docs/source/en/model_doc/phi3.md
@@ -0,0 +1,92 @@
+
+
+# Phi-3
+
+## Overview
+
+The Phi-3 model was proposed in [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Microsoft.
+
+### Summary
+
+The abstract from the Phi-3 paper is the following:
+
+We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. The innovation lies entirely in our dataset for training, a scaled-up version of the one used for phi-2, composed of heavily filtered web data and synthetic data. The model is also further aligned for robustness, safety, and chat format. We also provide some initial parameter-scaling results with a 7B and 14B models trained for 4.8T tokens, called phi-3-small and phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75% and 78% on MMLU, and 8.7 and 8.9 on MT-bench).
+
+The original code for Phi-3 can be found [here](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
+
+## Usage tips
+
+- This model is very similar to `Llama`, with the main difference being [`Phi3SuScaledRotaryEmbedding`] and [`Phi3YarnScaledRotaryEmbedding`], which are used to extend the context of the rotary embeddings. The query, key and values are fused, and the MLP's up and gate projection layers are also fused.
+- The tokenizer used for this model is identical to the [`LlamaTokenizer`], with the exception of additional tokens.
+
+## How to use Phi-3
+
+
+
+Phi-3 has been integrated in the development version (4.40.0.dev) of `transformers`. Until the official version is released through `pip`, ensure that you are doing one of the following:
+
+* When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
+
+* Update your local `transformers` to the development version: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`. The previous command is an alternative to cloning and installing from source.
+
+
+
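+With the first option, for instance (a sketch; on the development version the flag is not needed):
+
+```python
+from transformers import AutoModelForCausalLM
+
+# trust_remote_code=True lets released versions of transformers load the
+# model code shipped alongside the checkpoint on the Hub.
+model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
+```
+
+The full chat example below then runs with either setup:
+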
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+>>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+>>> messages = [{"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}]
+>>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+>>> outputs = model.generate(inputs, max_new_tokens=32)
+>>> text = tokenizer.batch_decode(outputs)[0]
+>>> print(text)
+<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Absolutely! Bananas and dragonfruits are both delicious fruits that can be combined in various ways to create tasty and nutrit
+```
+
+## Phi3Config
+
+[[autodoc]] Phi3Config
+
+
+
+
+## Phi3Model
+
+[[autodoc]] Phi3Model
+ - forward
+
+## Phi3ForCausalLM
+
+[[autodoc]] Phi3ForCausalLM
+ - forward
+ - generate
+
+## Phi3ForSequenceClassification
+
+[[autodoc]] Phi3ForSequenceClassification
+ - forward
+
+## Phi3ForTokenClassification
+
+[[autodoc]] Phi3ForTokenClassification
+ - forward
+
+
+
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 83cb699c2dc9fd..494ba660fa763d 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -65,6 +65,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
+* [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
* [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
* [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md
index a1a96271102ba4..3222f70c4d298a 100644
--- a/docs/source/en/tasks/asr.md
+++ b/docs/source/en/tasks/asr.md
@@ -28,13 +28,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-BERT](../model_doc/wav2vec2-bert), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/automatic-speech-recognition).
diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md
index 5ea3567f4c3c6c..c50107e44f1e17 100644
--- a/docs/source/en/tasks/audio_classification.md
+++ b/docs/source/en/tasks/audio_classification.md
@@ -28,13 +28,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-BERT](../model_doc/wav2vec2-bert), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/audio-classification).
diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md
index 3d3acf0541dbf9..54c0cd5aef3f3f 100644
--- a/docs/source/en/tasks/document_question_answering.md
+++ b/docs/source/en/tasks/document_question_answering.md
@@ -30,13 +30,7 @@ This guide illustrates how to:
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/document-question-answering).
diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md
index 25f232bc00a728..81ff45c4c8d5aa 100644
--- a/docs/source/en/tasks/image_classification.md
+++ b/docs/source/en/tasks/image_classification.md
@@ -30,13 +30,8 @@ This guide illustrates how to:
2. Use your fine-tuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [CLIP](../model_doc/clip), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [PVTv2](../model_doc/pvt_v2), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SigLIP](../model_doc/siglip), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/image-classification).
diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md
index 5c7bcd8595ca2e..af26ab1e44a0f6 100644
--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@@ -33,16 +33,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-You can finetune other architectures for causal language modeling following the same steps in this guide.
-Choose one of the following architectures:
-
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DBRX](../model_doc/dbrx), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [Jamba](../model_doc/jamba), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OLMo](../model_doc/olmo), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [RecurrentGemma](../model_doc/recurrent_gemma), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
-
-
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/text-generation).
diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md
index 1736e858eeb36e..5987e0193f10a8 100644
--- a/docs/source/en/tasks/masked_language_modeling.md
+++ b/docs/source/en/tasks/masked_language_modeling.md
@@ -30,14 +30,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-You can finetune other architectures for masked language modeling following the same steps in this guide.
-Choose one of the following architectures:
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/fill-mask).
diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md
index aea18299893196..d3cc8f3c3c89be 100644
--- a/docs/source/en/tasks/monocular_depth_estimation.md
+++ b/docs/source/en/tasks/monocular_depth_estimation.md
@@ -26,13 +26,8 @@ in the scene and the corresponding depth information, which can be affected by f
occlusion, and texture.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Depth Anything](../model_doc/depth_anything), [DPT](../model_doc/dpt), [GLPN](../model_doc/glpn)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/depth-estimation).
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md
index 9baa0eea5d5934..4adcad523284c9 100644
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@@ -25,17 +25,6 @@ This guide will show you how to:
1. Finetune [BERT](https://huggingface.co/google-bert/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
2. Use your finetuned model for inference.
-
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
-
-
Before you begin, make sure you have all the necessary libraries installed:
```bash
diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index 2513591f545238..273484bbb3ef02 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -33,13 +33,8 @@ In this guide, you will learn how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/object-detection).
diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md
index 724e51d0dc9f5d..367e35b121164f 100644
--- a/docs/source/en/tasks/question_answering.md
+++ b/docs/source/en/tasks/question_answering.md
@@ -31,15 +31,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/question-answering).
diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
index 048a1d38d003b6..ac44473001818c 100644
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@@ -201,13 +201,8 @@ We will now:
2. Use your fine-tuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/image-segmentation).
diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md
index 67fde97d090368..572d6493ba4f32 100644
--- a/docs/source/en/tasks/sequence_classification.md
+++ b/docs/source/en/tasks/sequence_classification.md
@@ -28,16 +28,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [Gemma](../model_doc/gemma), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [Jamba](../model_doc/jamba), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/text-classification).
diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md
index 37a305a4ac008e..e9e77189d4613a 100644
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@@ -31,13 +31,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SeamlessM4Tv2](../model_doc/seamless_m4t_v2), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/summarization).
diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md
index d0e4e87963f9b1..444d8421727d80 100644
--- a/docs/source/en/tasks/token_classification.md
+++ b/docs/source/en/tasks/token_classification.md
@@ -28,13 +28,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [BROS](../model_doc/bros), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Phi](../model_doc/phi), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task-page](https://huggingface.co/tasks/token-classification).
diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md
index c03ed34e58a3a5..e7838ea6be9625 100644
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@@ -28,13 +28,8 @@ This guide will show you how to:
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SeamlessM4Tv2](../model_doc/seamless_m4t_v2), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task-page](https://huggingface.co/tasks/translation).
diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md
index 1a0b8deeb1d34a..e3e998c7d67b6b 100644
--- a/docs/source/en/tasks/video_classification.md
+++ b/docs/source/en/tasks/video_classification.md
@@ -26,13 +26,8 @@ This guide will show you how to:
2. Use your fine-tuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae), [ViViT](../model_doc/vivit)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task-page](https://huggingface.co/tasks/video-classification).
diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md
index 73bfc4d650f131..9516876a00633e 100644
--- a/docs/source/es/tasks/language_modeling.md
+++ b/docs/source/es/tasks/language_modeling.md
@@ -30,8 +30,6 @@ Esta guĂa te mostrarĂĄ cĂłmo realizar fine-tuning [DistilGPT2](https://huggingf
-Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/google-bert/bert-base-uncased) siguiendo los mismos pasos presentados en esta guĂa!
-
Mira la [pågina de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [pågina de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener mås información sobre los modelos, datasets, y métricas asociadas.
diff --git a/docs/source/it/add_new_model.md b/docs/source/it/add_new_model.md
index f6daeeaf85d350..9403aa46a2183b 100644
--- a/docs/source/it/add_new_model.md
+++ b/docs/source/it/add_new_model.md
@@ -351,13 +351,14 @@ Nel caso speciale in cui stiate aggiungendo un modello, la cui architettura sia
dovrete solo aggiungere uno script di conversione, come descritto [qui](#write-a-conversion-script).
In questo caso, potete riutilizzare l'intera architettura del modello giĂ  esistente.
-Se questo non Ă© il caso, cominciamo con il generare un nuovo modello. Avrete due opzioni:
+Se questo non Ăš il caso, cominciamo con il generare un nuovo modello. Ti consigliamo di utilizzare il seguente script per aggiungere un modello a partire da
+un modello esistente:
-- `transformers-cli add-new-model-like` per aggiungere un nuovo modello come uno che gia esiste
-- `transformers-cli add-new-model` per aggiungere un nuovo modello da un nostro template (questo assomigliera a BERT o Bart, in base al modello che selezionerete)
+```bash
+transformers-cli add-new-model-like
+```
-In entrambi i casi, l'output vi darĂ un questionario da riempire con informazioni basi sul modello. Il secondo comando richiede di installare
-un `cookiecutter` - maggiori informazioni [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
+Ti verrĂ  presentato un questionario da compilare con le informazioni di base del tuo modello.
**Aprire una Pull Request in main huggingface/transformers repo**
diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml
index 354e22344a904a..cbc19313f3a03e 100644
--- a/docs/source/ja/_toctree.yml
+++ b/docs/source/ja/_toctree.yml
@@ -169,8 +169,6 @@
- sections:
- local: add_new_model
title: đ€ Transformersă«ăąăă«ăèżœć ăăæčæł
- - local: add_tensorflow_model
- title: đ€ Transformersăąăă«ăTensorFlowă«ć€æăăæčæł
- local: testing
title: ăăčă
- local: pr_checks
diff --git a/docs/source/ja/add_new_model.md b/docs/source/ja/add_new_model.md
index 0701e973deeb3a..1067cbaac72eca 100644
--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@@ -20,12 +20,6 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
Hugging Faceă§ăŻăăłăă„ăăăŁăźć€ăăźäșșă
ă«ç©æ„”çă«ăąăă«ăèżœć ăăćăäžăăăăšćȘćăăŠăăă
ăăźăŹă€ăăăŸăšăăŠăPyTorchăąăă«ăèżœć ăăăăă»ăčăèȘŹæăăŸăïŒ[PyTorchăă€ăłăčăăŒă«ăăăŠăăăăšăçąșèȘăăŠăă ăă](https://pytorch.org/get-started/locally/)ïŒă
-
-
-TensorFlowăąăă«ăćźèŁ
ăăèćłăăăć ŽćăŻă[đ€ Transformersăąăă«ăTensorFlowă«ć€æăăæčæł](add_tensorflow_model)ăŹă€ăăćç
§ăăŠăżăŠăă ăăïŒ
-
-
-
ăăźéçšă§ă仄äžăźăăšăćŠăłăŸăïŒ
- ăȘăŒăăłăœăŒăčăźăăčăăă©ăŻăăŁăčă«éąăăæŽćŻ
@@ -313,14 +307,15 @@ cd transformers
[ăăźă»ăŻă·ă§ăł](#write-a-conversion-script)ă§èȘŹæăăăŠăăăăă«ăć€æăčăŻăȘăăăèżœć ăăă ăă§æžăżăŸăă
ăăźć Žćăæąćăźăąăă«ăźćźć
šăȘăąăă«ăąăŒăăăŻăăŁăćć©çšă§ăăŸăă
-ăă仄ć€ăźć Žćăæ°ăăăąăă«ăźçæăéć§ăăŸăăăăă§2ă€ăźéžæèąăăăăŸăïŒ
-- `transformers-cli add-new-model-like`ăäœżçšăăŠæąćăźăąăă«ăźăăăȘæ°ăăăąăă«ăèżœć ăăŸă
-- `transformers-cli add-new-model`ăäœżçšăăŠăăăłăăŹăŒăăăæ°ăăăąăă«ăèżœć ăăŸăïŒăąăă«ăźăżă€ăă«ćżăăŠBERTăŸăăŻBartăźăăă«èŠăăŸăïŒ
+ăă仄ć€ăźć ŽćăŻăæ°ăăăąăă«ăźçæăéć§ăăŸăăăăæŹĄăźăčăŻăȘăăăäœżçšăăŠăæąćăźăąăă«ăăć§ăăăąăă«ăèżœć ăăăăšăăć§ăăăŸă:
+
+```bash
+transformers-cli add-new-model-like
+```
-ă©ăĄăăźć Žćă§ăăăąăă«ăźćșæŹæ
ć ±ăć
„ćăăăăăźèłȘćäșé
ăèĄšç€șăăăŸăă
-2çȘçźăźăłăăłăăćźèĄăăă«ăŻă`cookiecutter`ăă€ăłăčăăŒă«ăăćż
èŠăăăăŸăă
-è©łçŽ°ă«ă€ăăŠăŻ[ăăĄă](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model)ăă芧ăă ăăă
+ăąăă«ăźćșæŹæ
ć ±ăć
„ćăăăăăźăąăłă±ăŒăăèĄšç€șăăăŸăă
**äž»èŠăȘ huggingface/transformers ăȘăăžăăȘă§ăă«ăȘăŻăšăčăăéă**
diff --git a/docs/source/ja/add_tensorflow_model.md b/docs/source/ja/add_tensorflow_model.md
deleted file mode 100644
index 8bc7ed0d9ee740..00000000000000
--- a/docs/source/ja/add_tensorflow_model.md
+++ /dev/null
@@ -1,296 +0,0 @@
-
-
-
-# How to convert a đ€ Transformers model to TensorFlow?
-
-đ€ Transformersăäœżçšăăăăă«è€æ°ăźăăŹăŒă ăŻăŒăŻăć©çšćŻèœă§ăăăăšăŻăăąăăȘă±ăŒă·ă§ăłăèšèšăăéă«ăăăăăźćŒ·ăżă掻ăăæè»æ§ăæäŸăăŸăăă
-äșææ§ăăąăă«ăăšă«èżœć ăăćż
èŠăăăăăšăæćłăăŸăăăăăăćčžăăȘăăšă«
-æąćăźăąăă«ă«TensorFlowäșææ§ăèżœć ăăăăšăŻă[ăŒăăăæ°ăăăąăă«ăèżœć ăăăăš](add_new_model)ăăăç°Ąćă§ăïŒ
-性èŠæšĄăȘTensorFlowăąăă«ăźè©łçŽ°ăçè§Łăăăăäž»èŠăȘăȘăŒăăłăœăŒăčăźèČąçźăèĄăŁăăă
-éžæăăăąăă«ăTensorFlowă§æćčă«ăăăăăźăŹă€ăă§ăă
-
-ăăźăŹă€ăăŻăăłăă„ăăăŁăźăĄăłăăŒă§ăăăăȘăă«ăTensorFlowăąăă«ăźéăżăăăł/ăŸăăŻ
-ăąăŒăăăŻăăŁăđ€ Transformersă§äœżçšăăăăă«ăHugging FaceăăŒă ăăăźæć°éăźçŁèŠă§èČąçźă§ăăćăäžăăŸăăæ°ăăăąăă«ăæžăăăšăŻć°ăăȘćæ„ă§ăŻăăăŸăăăă
-ăăźăŹă€ăăèȘăăăšă§ăăăăăăŒă©ăŒăłăŒăčăżăŒăźăăăȘăăźăăæŁæ©ăźăăăȘăăźă«ăȘăăăšăéĄăŁăŠăăŸăđąđ¶ă
-ăăźăăă»ăčăăŸăăŸăç°Ąćă«ăăăăă«ăç§ăăĄăźć
±éăźç”éšă掻çšăăăăšăŻéćžžă«éèŠă§ăăźă§ă
-ăăźăŹă€ăăźæčćăææĄăăăăšăćŒ·ăăć§ăăăŸăïŒ
-
-ăăă«è©łăăèȘżăčăćă«ă仄äžăźăȘăœăŒăčăăă§ăăŻăăăăšăăć§ăăăŸăăđ€ TransformersăćăăŠăźć ŽćïŒ
-
-- [đ€ TransformersăźäžèŹçăȘæŠèŠ](add_new_model#general-overview-of-transformers)
-- [Hugging FaceăźTensorFlowćČćŠ](https://huggingface.co/blog/tensorflow-philosophy)
-
-ăăźăŹă€ăăźæźăăźéšćă§ăŻăæ°ăăTensorFlowăąăă«ăąăŒăăăŻăăŁăèżœć ăăăăă«ćż
èŠăȘăăźă
-PyTorchăTensorFlowăąăă«ăźéăżă«ć€æăăæé ăăăăłMLăăŹăŒă ăŻăŒăŻéăźäžäžèŽăćčççă«ăăăă°ăăæčæłă«ă€ăăŠćŠăłăŸăăăăă§ăŻć§ăăŸăăăïŒ
-
-
-
-äœżçšăăăăąăă«ă«ćŻŸćżăăTensorFlowăąăŒăăăŻăăŁăăă§ă«ććšăăăă©ăăăăăăȘăă§ăăïŒ
-
-
-
-éžæăăăąăă«ăź`config.json`ăź`model_type`ăăŁăŒă«ăăăă§ăăŻăăŠăżăŠăă ăă
-ïŒ[äŸ](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14)ïŒă
-đ€ Transformersăźè©Čćœăăăąăă«ăă©ă«ăă«ăććă"modeling_tf"ă§ć§ăŸăăăĄă€ă«ăăăć ŽćăăăăŻćŻŸćżăăTensorFlow
-ăąăŒăăăŻăăŁăæăŁăŠăăăăšăæćłăăŸăïŒ[äŸ](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)ïŒă
-
-
-
-## Step-by-step guide to add TensorFlow model architecture code
-
-性èŠæšĄăȘăąăă«ăąăŒăăăŻăăŁăèšèšăăæčæłăŻăăŸăăŸă§ăăăăăźèšèšăćźèŁ
ăăæčæłăăăŸăăŸă§ăă
-ăăăă[đ€ TransformersăźäžèŹçăȘæŠèŠ](add_new_model#general-overview-of-transformers)ăă
-æăćșăăŠăăă ăăăăăăăŸăăăăç§ăăĄăŻæèŠăźăăă°ă«ăŒăă§ă - đ€ TransformersăźäœżăăăăăŻäžèČ«æ§ăźăăèšèšăźéžæèąă«äŸćăăŠăăŸăăç”éšăăăTensorFlowăąăă«ăèżœć ăăéă«éèŠăȘăăšăăăă€ăăäŒăă§ăăŸăïŒ
-
-- è»èŒȘăćçșæăăȘăă§ăă ăăïŒă»ăšăă©ăźć ŽćăçąșèȘăăčăć°ăȘăăšă2ă€ăźćç
§ćźèŁ
ăăăăŸăăăăăŻă
-ăăȘăăćźèŁ
ăăŠăăăąăă«ăźPyTorchăăŒăžă§ăłăšăćăçšźéĄăźćéĄă«ćŻŸăăä»ăźTensorFlowăąăă«ă§ăă
-- ćȘăăăąăă«ćźèŁ
ăŻæéăźè©Šç·Žăäčăè¶ăăŸăăăăăŻăăłăŒăăăăăă ăăă§ăŻăȘăăăłăŒăăæçąșă§ăăăăă°ăăăăăæ§çŻăăăăăăă§ăăTensorFlowćźèŁ
ă§PyTorchćźèŁ
ăšäžèŽăăăăżăŒăłăè€èŁœăăPyTorchćźèŁ
ăšăźäžäžèŽăæć°éă«æăăăăšă§ă
-ăăȘăăźèČąçźăé·æéă«ăăăŁăŠæçšă§ăăăăšăäżèšŒăăŸăă
-- èĄăè©°ăŸăŁăăć©ăăæ±ăăŠăă ăăïŒ đ€ TransformersăăŒă ăŻăăă«ăăŸăăăăăăăăăȘăăçŽéąăăŠăăćăćéĄă«ćŻŸăăè§Łæ±șçăèŠă€ăăŠăăŸăă
-
-TensorFlowăąăă«ăąăŒăăăŻăăŁăèżœć ăăăăă«ćż
èŠăȘăčăăăăźæŠèŠăŻæŹĄăźăšăăă§ăïŒ
-1. ć€æăăăăąăă«ăéžæ
-2. transformersăźéçșç°ćąăæșć
-3. ïŒăȘăă·ă§ăłïŒçè«çăȘćŽéąăšæąćăźćźèŁ
ăçè§Ł
-4. ăąăă«ăąăŒăăăŻăăŁăćźèŁ
-5. ăąăă«ăźăăčăăćźèŁ
-6. ăă«ăȘăŻăšăčăăæćș
-7. ïŒăȘăă·ă§ăłïŒăăąăæ§çŻăăŠäžçăšć
±æ
-
-### 1.-3. Prepare your model contribution
-
-**1. ć€æăăăăąăă«ăéžæăă**
-
-ăŸăăćșæŹăăć§ăăŸăăăăæćă«ç„ăŁăŠăăćż
èŠăăăăăšăŻăć€æăăăăąăŒăăăŻăăŁă§ăă
-çčćźăźăąăŒăăăŻăăŁăæ±șăăŠăăȘăć Žćăđ€ Transformers ăăŒă ă«ææĄăæ±ăăăăšăŻăćœ±éżăæ性éă«ăăçŽ æŽăăăæčæłă§ăă
-ăăŒă ăŻăTensorFlow ă”ă€ăă§äžè¶łăăŠăăæăæłšçźăăăăąăŒăăăŻăăŁă«ćăăŠăŹă€ăăăŸăă
-TensorFlow ă§äœżçšăăăçčćźăźăąăă«ă«ăđ€ Transformers ă«æąă« TensorFlow ăąăŒăăăŻăăŁăźćźèŁ
ăććšăăŠăăăăéăżăäžè¶łăăŠăăć Žćă
-ăăźăăŒăžăź[éăżăźèżœć ă»ăŻă·ă§ăł](#adding-tensorflow-weights-to--hub)ă«çŽæ„移ćăăŠăă ăăă
-
-ç°Ąćă«ăăăăă«ăăăźăŹă€ăăźæźăăźéšćă§ăŻăTensorFlow ăăŒăžă§ăłăź *BrandNewBert* ăèČąçźăăăăšăæ±șćźăăăšä»źćźăăŠăăŸă
-ïŒăăăŻă[æ°ăăăąăă«ăźèżœć ăŹă€ă](add_new_model)ă§ăźäŸăšćăă§ăïŒă
-
-
-
-TensorFlow ăąăă«ăźăąăŒăăăŻăăŁă«ćăç”ăćă«ăăăăèĄăăăăźéČèĄäžăźćăç”ăżăăȘăăăćçąșèȘăăŠăă ăăă
-GitHub ăăŒăžăź[ăă«ăȘăŻăšăčă](https://github.com/huggingface/transformers/pulls?q=is%3Apr)㧠`BrandNewBert` ăæ€çŽąăăŠă
-TensorFlow éąéŁăźăă«ăȘăŻăšăčăăăȘăăăšăçąșèȘă§ăăŸăă
-
-
-
-
-**2. transformers éçșç°ćąăźæșć**
-
-ăąăă«ăąăŒăăăŻăăŁăéžæăăăăæćăç€șăăăă«ăă©ăă PR ăéăăăăźç°ćąăèšćźăăŠăă ăăă
-仄äžăźæé ă«ćŸăŁăŠăç°ćąăèšćźăăăă©ăă PR ăéăăŠăă ăăă
-
-1. ăȘăăžăăȘăźăăŒăžă§ 'Fork' ăăżăłăăŻăȘăăŻăăŠă[ăȘăăžăăȘ](https://github.com/huggingface/transformers)ăăă©ăŒăŻăăŸăă
- ăăă«ăăăăłăŒăăźăłăăŒă GitHub ăŠăŒă¶ăŒăąă«ăŠăłăăźäžă«äœæăăăŸăă
-
-2. ăăŒă«ă«ăăŁăčăŻă«ăă 'transformers' ăă©ăŒăŻăăŻăăŒăłăăăăŒăčăȘăăžăăȘăăȘăąăŒăăšăăŠèżœć ăăŸă:
-
-```bash
-git clone https://github.com/[your Github handle]/transformers.git
-cd transformers
-git remote add upstream https://github.com/huggingface/transformers.git
-```
-
-3. éçșç°ćąăèšćźăăŸăăăăšăă°ă仄äžăźăłăăłăăćźèĄăăŠăă ăăïŒ
-
-```bash
-python -m venv .env
-source .env/bin/activate
-pip install -e ".[dev]"
-```
-
-äŸćéąäżăćąăăŠăăăăăOSă«ćżăăŠăTransformersăźăȘăă·ă§ăłăźäŸćéąäżăźæ°ăćąăăăăăăăŸăăăăăźć ŽćăŻăTensorFlowăă€ăłăčăăŒă«ăăŠăăæŹĄăźăłăăłăăćźèĄăăŠăă ăăă
-
-```bash
-pip install -e ".[quality]"
-```
-
-**æłšæ:** CUDAăă€ăłăčăăŒă«ăăćż
èŠăŻăăăŸăăăæ°ăăăąăă«ăCPUă§ćäœăăăăšăććă§ăă
-
-4. ăĄă€ăłăă©ăłăăăăăăăăăććăźăă©ăłăăäœæăăŠăă ăăă
-
-```bash
-git checkout -b add_tf_brand_new_bert
-```
-5. çŸćšăźmainăă©ăłăă«ăă§ăăăăŠăȘăăŒăčăă
-
-```bash
-git fetch upstream
-git rebase upstream/main
-```
-
-6. `transformers/src/models/brandnewbert/`ă«`modeling_tf_brandnewbert.py`ăšăăććăźç©șăź`.py`ăăĄă€ă«ăèżœć ăăŸăăăăăŻăăȘăăźTensorFlowăąăă«ăăĄă€ă«ă§ăă
-
-7. 仄äžăäœżçšăăŠć€æŽć
ćźčăăąă«ăŠăłăă«ăăă·ă„ăăŸăïŒ
-
-```bash
-git add .
-git commit -m "initial commit"
-git push -u origin add_tf_brand_new_bert
-```
-
-8. GitHubäžă§ăă©ăŒăŻăăăŠă§ăăăŒăžă«ç§»ćăăăăă«ăȘăŻăšăčăăăăŻăȘăăŻăăŸăăć°æ„ăźć€æŽă«ćăăŠăHugging Face ăăŒă ăźăĄăłăăŒăźGitHubăăłăă«ăăŹăă„ăąăŒăšăăŠèżœć ăăŠăă ăăă
-
-9. GitHubăźăă«ăȘăŻăšăčăăŠă§ăăăŒăžăźćłćŽă«ăăăăă©ăăă«ć€æăăăŻăȘăăŻăăŠăăă«ăȘăŻăšăčăăăă©ăăă«ć€æŽăăŸăă
-
-ăăă§ăđ€ Transformersć
ă«*BrandNewBert*ăTensorFlowă«ç§»æ€ăăăăăźéçșç°ćąăèšćźăăăŸăăă
-
-**3. (ä»»æ) çè«çăȘćŽéąăšæąćăźćźèŁ
ăçè§Łăă**
-
-*BrandNewBert*ăźè«æăććšăăć Žćăăăźèšèż°çăȘäœæ„ăèȘăæéăćăăčăă§ăăè«æă«ăŻçè§ŁăéŁăă性ăăȘă»ăŻă·ă§ăłăăăăăăăăŸăăăăăźć Žćă§ăćéĄăăăŸăă - ćżé
ăăȘăă§ăă ăăïŒçźæšăŻè«æăźçè«çăȘçè§Łăæ·±ăăăăšă§ăŻăȘăăđ€ TransformersăäœżçšăăŠTensorFlowă§ăąăă«ăćčæçă«ććźèŁ
ăăăăă«ćż
èŠăȘæ
ć ±ăæœćșăăăăšă§ăăăšăŻèšăăçè«çăȘćŽéąă«ăăŸăæéăăăăćż
èŠăŻăăăŸăăă代ăăă«ăæąćăźăąăă«ăźăăă„ăĄăłăăŒă·ă§ăłăăŒăžïŒăăšăă°ă[BERTăźăąăă«ăăă„ăĄăłă](model_doc/bert)ăȘă©ïŒă«çŠçčăćœăŠăăčăă§ăă
-
-ćźèŁ
ăăăąăă«ăźćșæŹăææĄăăćŸăæąćăźćźèŁ
ăçè§ŁăăăăšăŻéèŠă§ăăăăăŻăćäœăăćźèŁ
ăăąăă«ă«ćŻŸăăæćŸ
ăšäžèŽăăăăšăçąșèȘăăç”¶ć„œăźæ©äŒă§ăăăTensorFlowćŽă§ăźæèĄçăȘèȘČéĄăäșæžŹăăăăšăă§ăăŸăă
-
-æ
ć ±ăźć€ăă«ć§ćăăăŠăăăšæăăăźăŻćźć
šă«èȘç¶ă§ăăăăźæź”éă§ăŻăąăă«ăźăăčăŠăźćŽéąăçè§Łăăćż
èŠăŻăăăŸăăăăă ăă[ăă©ăŒă©ă ](https://discuss.huggingface.co/)ă§æ„ăȘèłȘćăè§Łæ±șăăăăšăćŒ·ăăć§ăăăŸăă
-
-
-### 4. Model implementation
-
-ăăăăăăăăłăŒăăŁăłă°ăć§ăăŸăăăăăć§ăăăćșçșçčăŻăPyTorchăăĄă€ă«ăăźăăźă§ăă
-`src/transformers/models/brand_new_bert/`ć
ăź`modeling_brand_new_bert.py`ăźć
ćźčă
-`modeling_tf_brand_new_bert.py`ă«ăłăăŒăăŸăăăăźă»ăŻă·ă§ăłăźçźæšăŻă
-đ€ Transformersăźă€ăłăăŒăæ§é ăæŽæ°ăă`TFBrandNewBert`ăš
-`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`ăæŁćžžă«èȘăżèŸŒăćäœăăTensorFlow *BrandNewBert*ăąăă«ă
-ă€ăłăăŒăă§ăăăăă«ăăăăšă§ăă
-
-æźćż”ăȘăăăPyTorchăąăă«ăTensorFlowă«ć€æăăæçąșăȘæčæłăŻăăăŸăăăăă ăăăăă»ăčăă§ăăă ăăčă ăŒășă«ăăăăăźăăłăă仄äžă«ç€șăăŸăïŒ
-
-- ăăčăŠăźăŻă©ăčăźććăźćă« `TF` ăä»ăăŸăïŒäŸïŒ `BrandNewBert` 㯠`TFBrandNewBert` ă«ăȘăăŸăïŒă
-- ă»ăšăă©ăźPyTorchăźæäœă«ăŻăçŽæ„TensorFlowăźä»ŁæżăăăăŸăăăăšăă°ă`torch.nn.Linear` 㯠`tf.keras.layers.Dense` ă«ćŻŸćżăă`torch.nn.Dropout` 㯠`tf.keras.layers.Dropout` ă«ćŻŸćżăăŸăăçčćźăźæäœă«ă€ăăŠäžæçąșăȘć ŽćăŻă[TensorFlowăźăăă„ăĄăłă](https://www.tensorflow.org/api_docs/python/tf)ăŸăăŻ[PyTorchăźăăă„ăĄăłă](https://pytorch.org/docs/stable/)ăćç
§ă§ăăŸăă
-- đ€ TransformersăźăłăŒăăăŒăčă«ăăżăŒăłăèŠă€ăăăŸăăçčćźăźæäœă«çŽæ„çăȘ代æżăăȘăć ŽćăèȘ°ăăăă§ă«ćăćéĄă«ćŻŸćŠăăŠăăćŻèœæ§ăé«ăă§ăă
-- ăăă©ă«ăă§ăŻăPyTorchăšćăć€æ°ćăšæ§é ăç¶æăăŸăăăăă«ăăăăăăă°ăćéĄăźèżœè·ĄăäżźæŁăźèżœć ăćźčæă«ăȘăăŸăă
-- äžéšăźăŹă€ă€ăŒă«ăŻăćăăŹăŒă ăŻăŒăŻă§ç°ăȘăăăă©ă«ăć€ăăăăŸăăæłšçźăăčăäŸăŻăăăăæŁèŠćăŹă€ă€ăŒăź epsilon ă§ăïŒPyTorchă§ăŻ`1e-5`ă[TensorFlowă§ăŻ](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization) `1e-3` ă§ăïŒăăăă„ăĄăłăăćçąșèȘăăŠăă ăăïŒ
-- PyTorchăź `nn.Parameter` ć€æ°ăŻéćžžăTF Layerăź `build()` ć
ă§ćæćăăćż
èŠăăăăŸăăæŹĄăźäŸăćç
§ăăŠăă ăăïŒ[PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220)
-- PyTorchăąăă«ă«éąæ°ăźäžéšă« `#copied from ...` ăăăć ŽćăTensorFlowăąăă«ăćăăąăŒăăăŻăăŁăăăăźéąæ°ăćăăăăšăă§ăăćŻèœæ§ăé«ăă§ăăTensorFlowăąăŒăăăŻăăŁăăăć Žćă§ăă
-- TensorFlowéąæ°ć
㧠`name`ć±æ§ăæŁăăèšćźăăăăšăŻă`from_pt=True`ăźăŠă§ă€ăăźăŻăăčăăŒăăăŒăăèĄăăăă«éèŠă§ăăéćžžă`name`ăŻPyTorchăłăŒăć
ăźćŻŸćżăăć€æ°ăźććă§ăă`name`ăæŁăăèšćźăăăŠăăȘăć Žćăăąăă«ăŠă§ă€ăăźăăŒăæă«ăšă©ăŒăĄăă»ăŒăžă§èĄšç€șăăăŸăă
-- ăăŒăčăąăă«ăŻă©ăč `BrandNewBertModel` ăźăăžăăŻăŻćźéă«ăŻ `TFBrandNewBertMainLayer` ă«ăăăŸăăăăăŻKerasăŹă€ă€ăŒăźă”ăăŻă©ăčă§ăïŒ[äŸ](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)ïŒă`TFBrandNewBertModel` ăŻăćă«ăăźăŹă€ă€ăŒăźă©ăăăŒă§ăă
-- ăąăă«ăèȘăżèŸŒăăăă«ăŻăKerasăąăă«ăăă«ăăăćż
èŠăăăăŸăăăăźăăă`TFBrandNewBertPreTrainedModel` ăŻăąăă«ăžăźć
„ćăźäŸă`dummy_inputs` ăæă€ćż
èŠăăăăŸăïŒ[äŸ](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)ïŒă
-- èĄšç€șăæąăŸăŁăć ŽćăŻăć©ăăæ±ăăŠăă ăăăç§ăăĄăŻăăȘăăźăæäŒăă«ăăă«ăăŸăïŒ đ€
-
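The two points in the list above about `build()` weights and `from_pt=True` cross-loading can be made concrete with a short sketch; BERT stands in for the hypothetical *BrandNewBert*, and `ToyEmbeddings` is an illustrative layer, not library code:

```python
import tensorflow as tf
from transformers import TFBertModel

# Cross-load PyTorch weights into a TF architecture (requires torch installed);
# once TF weights exist on the Hub, the from_pt flag is no longer needed.
model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", from_pt=True)

class ToyEmbeddings(tf.keras.layers.Layer):
    def build(self, input_shape):
        # TF counterpart of a PyTorch nn.Parameter: create the weight in build(),
        # with a name matching the PyTorch variable so cross-loading can find it.
        self.position_embeddings = self.add_weight(
            shape=(512, 768), initializer="zeros", name="position_embeddings"
        )
        super().build(input_shape)
```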
-ăąăă«ăăĄă€ă«èȘäœă ăă§ăȘăăăąăă«ăŻă©ăčăšéąéŁăăăăă„ăĄăłăăŒă·ă§ăłăăŒăžăžăźăă€ăłăżăŒăèżœć ăăćż
èŠăăăăŸăăä»ăźPRăźăăżăŒăłă«ćŸăŁăŠăăźéšćăćźäșă§ăăŸă
-ïŒ[äŸ](https://github.com/huggingface/transformers/pull/18020/files)ïŒă
-仄äžăŻæćă§ăźć€æŽăćż
èŠăȘäžèŠ§ă§ăïŒ
-- *BrandNewBert*ăźăăčăŠăźăăăȘăăŻăŻă©ăčă `src/transformers/__init__.py` ă«ć«ăă
-- *BrandNewBert*ăŻă©ăčă `src/transformers/models/auto/modeling_tf_auto.py` ăźćŻŸćżăăAutoăŻă©ăčă«èżœć
-- ăăă„ăĄăłăăŒă·ă§ăłăăčăăăĄă€ă«ăźăȘăčăă«ăąăăȘăłă°ăăĄă€ă«ăèżœć ăă `utils/documentation_tests.txt`
-- `src/transformers/utils/dummy_tf_objects.py` ă« *BrandNewBert* ă«éąéŁăă遅滶ăăŒăăŻă©ăčăèżœć 
-- `src/transformers/models/brand_new_bert/__init__.py` ă§ăăăȘăăŻăŻă©ăčăźă€ăłăăŒăæ§é ăæŽæ°
-- `docs/source/en/model_doc/brand_new_bert.md` ă« *BrandNewBert* ăźăăăȘăăŻăĄăœăăăźăăă„ăĄăłăăŒă·ă§ăłăă€ăłăżăŒăèżœć
-- `docs/source/en/model_doc/brand_new_bert.md` ăź *BrandNewBert* ăźèČąçźè
ăȘăčăă«èȘćèȘèș«ăèżœć 
-- æćŸă«ă`docs/source/en/index.md` ăź *BrandNewBert* ăźTensorFlowćă«ç·èČăźăă§ăăŻăăŒăŻ â
 ăèżœć 
-
-ăąăă«ăąăŒăăăŻăăŁăæșćă§ăăŠăăăăšăçąșèȘăăăăă«ă仄äžăźăă§ăăŻăȘăčăăćźèĄăăŠăă ăăïŒ
-1. èšç·Žæă«ç°ăȘăćäœăăăăăčăŠăźăŹă€ă€ăŒïŒäŸïŒDropoutïŒăŻă`training`ćŒæ°ăäœżçšăăŠćŒăłćșăăăăăăæäžäœăŻă©ăčăăäŒæăăăŸăă
-2. ćŻèœăȘéă `#copied from ...` ăäœżçšăăŸăă
-3. `TFBrandNewBertMainLayer` ăăăłăăăäœżçšăăăăčăŠăźăŻă©ăčăź `call` éąæ°ă `@unpack_inputs` ă§ăăłăŹăŒăăăăŠăăŸă
-4. `TFBrandNewBertMainLayer` 㯠`@keras_serializable` ă§ăăłăŹăŒăăăăŠăăŸă
-5. PyTorchăŠă§ă€ăăăTensorFlowăŠă§ă€ăăäœżçšăăŠTensorFlowăąăă«ăăăŒăă§ăăŸă `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`
-6. äșæăăăć
„ććœąćŒăäœżçšăăŠTensorFlowăąăă«ăćŒăłćșăăăšăă§ăăŸă
-
-
-### 5. Add model tests
-
-ăăŁăăăTensorFlowăąăă«ăćźèŁ
ăăŸăăïŒ
-ä»ćșŠăŻăăąăă«ăæćŸ
éăă«ćäœăăăăšăçąșèȘăăăăăźăăčăăèżœć ăăæéă§ăă
-ćăźă»ăŻă·ă§ăłăšćæ§ă«ă`tests/models/brand_new_bert/`ăăŁăŹăŻăăȘć
ăź`test_modeling_brand_new_bert.py`ăăĄă€ă«ă`test_modeling_tf_brand_new_bert.py`ă«ăłăăŒăăćż
èŠăȘTensorFlowăźçœźæăèĄăăăšăăć§ăăăŸăă
-ä»ăźæź”éă§ăŻăăăčăŠăź`.from_pretrained()`ćŒăłćșăă§ăæąćăźPyTorchăźéăżăăăŒăăăăăă«`from_pt=True`ăă©ă°ăäœżçšăăćż
èŠăăăăŸăă
-
-äœæ„ăćźäșăăăăăăčăăćźèĄăăæșćăæŽăăŸăăïŒ đŹ
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-æăćŻèœæ§ăźé«ăç”æăŻăć€ăăźăšă©ăŒăèĄšç€șăăăăăšă§ăăćżé
ăăȘăă§ăă ăăăăăăŻäșæłăăăćäœă§ăïŒ
-MLăąăă«ăźăăăă°ăŻéćžžă«éŁăăăšăăăŠăăăæćăźé”ăŻćżèćïŒăš`breakpoint()`ïŒă§ăăç§ăăĄăźç”éšă§ăŻă
-æăéŁăăćéĄăŻMLăăŹăŒă ăŻăŒăŻéăźćŸźćŠăȘäžäžèŽăăçșçăăăăă«ă€ăăŠăŻăăźăŹă€ăăźæćŸă«ăăă€ăăźăă€ăłăżăç€șăăŸăă
-ä»ăźć Žćă§ăŻăäžèŹçăȘăăčăăçŽæ„ăąăă«ă«é©çšă§ăăȘăć Žćăăăăăăźć ŽćăŻăąăă«ăźăăčăăŻă©ăčăŹăă«ă§ăȘăŒăăŒă©ă€ăăææĄăăŸăă
-ćéĄăźçšźéĄă«éąäżăȘăăè©°ăŸăŁăć ŽćăŻăăă©ăăăźăă«ăȘăŻăšăčăă§ć©ăăæ±ăăăăšăăăăăăȘăă§ăă ăăă
-
-ăăčăŠăźăăčăăăăčăăăăăăă§ăšăăăăăŸăăăăȘăăźăąăă«ăŻă»ăŒđ€ Transformersă©ă€ăă©ăȘă«èżœć ăăæșćăæŽăăŸăăïŒđ
-
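Where the deleted guide mentions overriding a common test at the model's test-class level, the usual pattern is a skip with an explicit reason. A hedged sketch (class name, base classes, and skip reason are illustrative; the common-test mixins live in the repo's test suite, not the public API):

```python
import unittest

from transformers.testing_utils import require_tf


@require_tf
class TFBrandNewBertModelTest(unittest.TestCase):
    # Illustrative override: replace an inherited common test that does not
    # apply to this architecture with an explicit, documented skip.
    @unittest.skip(reason="BrandNewBert does not support this common test")
    def test_resize_token_embeddings(self):
        pass
```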
-**6. ăă«ăȘăŻăšăčăăæćșăă**
-
-ćźèŁ
ăšăăčăăćźäșăăăăăă«ăȘăŻăšăčăăæćșăăæșćăæŽăăŸăăăăłăŒăăăăă·ă„ăăćă«ă
-ăłăŒăăă©ăŒăăăăŠăŒăăŁăȘăăŁă§ăă `make fixup` đȘ ăćźèĄăăŠăă ăăă
-ăăă«ăăăèȘćçăȘăă§ăăŻă«ć€±æăăćŻèœæ§ăźăăăă©ăŒăăăăźćéĄăèȘćçă«äżźæŁăăăŸăă
-
-ăăă§ăăă©ăăăă«ăȘăŻăšăčăăćźéăźăă«ăȘăŻăšăčăă«ć€æăăæșćăæŽăăŸăăă
-ăăăèĄăă«ăŻăăăŹăă„ăŒćŸ
ăĄăăăżăłăăŻăȘăăŻăăJoaoïŒ`@gante`ïŒăšMattïŒ`@Rocketknight1`ïŒăăŹăă„ăŻăŒăšăăŠèżœć ăăŸăă
-ăąăă«ăă«ăȘăŻăšăčăă«ăŻć°ăȘăăšă3äșșăźăŹăă„ăŻăŒăćż
èŠă§ăăăăąăă«ă«é©ćăȘèżœć ăźăŹăă„ăŻăŒăèŠă€ăăăźăŻćœŒăăźèČŹä»»ă§ăă
-
-ăăčăŠăźăŹăă„ăŻăŒăăă«ăȘăŻăšăčăăźç¶æ
ă«æșè¶łăăăăæćŸăźăąăŻă·ă§ăłăă€ăłăăŻă`.from_pretrained()` ćŒăłćșă㧠`from_pt=True` ăă©ă°ăćé€ăăăăšă§ăă
-TensorFlowăźăŠă§ă€ăăććšăăȘăăăăăăăèżœć ăăćż
èŠăăăăŸăïŒăăăèĄăæčæłă«ă€ăăŠăŻă仄äžăźă»ăŻă·ă§ăłăçąșèȘăăŠăă ăăă
-
-æćŸă«ăTensorFlowăźăŠă§ă€ăăăăŒăžăăăć°ăȘăăšă3äșșăźăŹăă„ăŒăąăæżèȘăăăăčăŠăźCIăă§ăăŻă
-æćăăć ŽćăăăčăăăăŒă«ă«ă§æćŸă«ăăäžćșŠçąșèȘăăŠăă ăăă
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-ăăăŠăăăȘăăźPRăăăŒăžăăŸăïŒăă€ă«ăčăăŒăłéæăăă§ăšăăăăăŸă đ
-
-**7. (Optional) ăăąăäœæăăŠäžçăšć
±æ**
-
-ăȘăŒăăłăœăŒăčăźæăéŁăăéšćăź1ă€ăŻăçșèŠă§ăăăăȘăăźçŽ æŽăăăTensorFlowăźèČąçźăććšăăăăšăä»ăźăŠăŒă¶ăŒăă©ăźăăă«ç„ăăăšăă§ăăă§ăăăăïŒé©ćăȘăłăă„ăă±ăŒă·ă§ăłă§ăïŒ đŁ
-
-ăłăă„ăăăŁăšăąăă«ăć
±æăăäž»èŠăȘæčæłăŻ2ă€ăăăŸăă
-- ăăąăäœæăăŸăăăăă«ăŻGradioăăąăăăŒăăăăŻăăăăłăąăă«ăçŽčä»ăăăăăźä»ăźæ„œăăæčæłăć«ăŸăăŸăă[ăłăă„ăăăŁé§ćăźăăą](https://huggingface.co/docs/transformers/community)ă«ăăŒăăăăŻăèżœć ăăăăšăćŒ·ăăć§ăăăŸăă
-- TwitterăLinkedInăȘă©ăźăœăŒă·ăŁă«ăĄăăŁăąă§ăčăăŒăȘăŒăć
±æăăŸăăăăȘăăźä»äșă«èȘăăæăĄăăłăă„ăăăŁăšăăȘăăźææăć
±æăăăčăă§ă - ăăȘăăźăąăă«ăŻä»ăäžçäžăźäœćäșșăăźăšăłăžăăąăç 究è
ă«ăăŁăŠäœżçšăăăćŻèœæ§ăăăăŸă đïŒç§ăăĄăŻăăȘăăźæçšżăăȘăă€ăŒăăăŠć
±ćäœăšć
±æăăăæäŒăăćăă§ăăŸăă
-
-## Adding TensorFlow weights to đ€ Hub
-
-TensorFlowăąăă«ăźăąăŒăăăŻăăŁăđ€ Transformersă§ć©çšćŻèœăȘć ŽćăPyTorchăźéăżăTensorFlowăźéăżă«ć€æăăăăšăŻç°Ąćă§ăïŒ
-
-仄äžăăăźæčæłă§ăïŒ
-1. ăżăŒăăă«ă§Hugging Faceăąă«ăŠăłăă«ăă°ă€ăłăăŠăăăăšăçąșèȘăăŠăă ăăăăłăăłă`huggingface-cli login`ăäœżçšăăŠăă°ă€ăłă§ăăŸăïŒăąăŻă»ăčăăŒăŻăłăŻ[ăăĄă](https://huggingface.co/settings/tokens)ă§èŠă€ăăăăšăă§ăăŸăïŒă
-2. `transformers-cli pt-to-tf --model-name foo/bar`ăšăăăłăăłăăćźèĄăăŸăăăăă§ă`foo/bar`ăŻć€æăăăPyTorchăźéăżăć«ăăąăă«ăȘăăžăăȘăźććă§ăă
-3. äžèšăźăłăăłăă§äœæăăăđ€ Hub PRă«`@joaogante`ăš`@Rocketknight1`ăăżă°ä»ăăăŸăă
-
-ăăă ăă§ăïŒ đ
-
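Once the Hub PR opened by `pt-to-tf` is merged, the TensorFlow weights load natively, without the `from_pt` flag; a minimal check, reusing the guide's own placeholder repo name:

```python
from transformers import TFAutoModel

# "foo/bar" is the placeholder repo from the steps above; after the converted
# TF weights are merged there, no from_pt flag is needed any more.
model = TFAutoModel.from_pretrained("foo/bar")
```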
-## Debugging mismatches across ML frameworks đ
-
-æ°ăăăąăŒăăăŻăăŁăèżœć ăăăăæąćăźăąăŒăăăŻăăŁăźTensorFlowăźéăżăäœæăăăăăéăPyTorchăšTensorFlowéăźäžäžèŽă«ă€ăăŠăźăšă©ăŒă«ééăăăăšăăăăŸăă
-ć Žćă«ăăŁăŠăŻăPyTorchăšTensorFlowăźăąăă«ăąăŒăăăŻăăŁăă»ăŒćäžă§ăăă«ăăăăăăăäžäžèŽăææăăăšă©ăŒăèĄšç€șăăăăăšăăăăŸăă
-ă©ăăăŠă§ăăăăïŒ đ€
-
-ăŸăæćă«ăăȘăăăăźäžäžèŽăçè§ŁăăăăšăéèŠăă«ă€ăăŠè©±ăăŸăăăăć€ăăźăłăă„ăăăŁăĄăłăăŒăŻđ€ Transformersăąăă«ăăăźăŸăŸäœżçšăăăąăă«ăæćŸ
ă©ăăă«ćäœăăăšäżĄé ŒăăŠăăŸăă
-2ă€ăźăăŹăŒă ăŻăŒăŻéă§ć€§ăăȘäžäžèŽăăăăšăć°ăȘăăšă1ă€ăźăăŹăŒă ăŻăŒăŻăźăȘăăĄăŹăłăčćźèŁ
ă«ćŸăŁăŠăąăă«ăćäœăăȘăăăšăæćłăăŸăă
-ăăă«ăăăăąăă«ăŻćźèĄăăăŸăăæ§èœăäœäžăăćŻèœæ§ăăăăéăăȘ怱æăçșçăăćŻèœæ§ăăăăŸăăăăăŻăć
šăćźèĄăăăȘăăąăă«ăăăæȘăăšèšăăăăăăăŸăăïŒăăźăăăăąăă«ăźăăčăŠăźæź”éă§ăźăăŹăŒă ăŻăŒăŻăźäžäžèŽă`1e-5`æȘæșă§ăăăăšăçźæăăŠăăŸăă
-
-æ°ć€èšçźăźćéĄăšćæ§ă«ăè©łçŽ°ă«ă€ăăŠăŻçŽ°ăăăšăăă«ăăăŸăăăăăŠăè©łçŽ°æćăźæèĄă§ăă仄äžăç§ćŻăźèŠçŽ ăŻćżèă§ăă
-ăăźçšźăźćéĄă«ééăăć Žćăźăć§ăăźăŻăŒăŻăăăŒăŻæŹĄăźăšăăă§ăïŒ
-1. äžäžèŽăźćć ăçčćźăăŸăăć€æäžăźăąăă«ă«ăŻăăăăçčćźăźçčăŸă§ă»ăŒćäžăźć
éšć€æ°ăăăăŸăă
- 䞥æčăźăăŹăŒă ăŻăŒăŻăźăąăŒăăăŻăăŁă«`breakpoint()`ăčăăŒăăĄăłăăé
çœźăăăăăăăŠăłăźæčæłă§æ°ć€ć€æ°ăźć€ăæŻèŒăăćéĄăźćć ăèŠă€ăăŸăă
-2. ćéĄăźćć ăçčćźăăăăđ€ TransformersăăŒă ăšéŁç”ĄăćăăŸăăăăćæ§ăźćéĄă«ééăăăăšăăăăăăăăăèż
éă«è§Łæ±șçăæäŸă§ăăăăăăăŸăăăæç”ææź”ăšăăŠăStackOverflowăGitHubăźćéĄăȘă©ăäșșæ°ăźăăăăŒăžăăčăăŁăłăăŸăă
-3. è§Łæ±șçăèŠćœăăăȘăć ŽćăćéĄăæăäžăăćż
èŠăăăăăšăæćłăăŸăăèŻăăă„ăŒăčăŻăćéĄăźćć ăçčćźăăăăšă§ăăăăăăŁăŠăćéĄăźăăćœä»€ă«çŠçčăćœăŠăăąăă«ăźæźăăæœè±Ąćă§ăăŸăïŒæȘăăă„ăŒăčăŻăăăźćœä»€ăźăœăŒăčćźèŁ
ă«éČăćż
èŠăăăăăšă§ăăäžéšăźć Žćă§ăŻăăȘăăĄăŹăłăčćźèŁ
ă«ćéĄăăăăăăăăŸăăă - äžæ”ăȘăăžăăȘă§ćéĄăéăăźăæ§ăăȘăă§ăă ăăă
-
-đ€ TransformersăăŒă ăšăźè©±ăćăă§ăäžäžèŽăäżźæŁăăăăšăć°éŁă§ăăăăšăć€æăăăăšăăăăŸăă
-ćșćăŹă€ă€ăŒăźăąăă«ă§äžäžèŽăéćžžă«ć°ăăć ŽćïŒăă ăăé ăăç¶æ
ă§ăŻć€§ăăćŻèœæ§ăăăïŒăăąăă«ăé
ćžăăăăă«ăăăăçĄèŠăăăăšă«ăăăăăăăŸăăă
-äžèšă§èšćăă`pt-to-tf` CLIă«ăŻăéăżć€ææă«ăšă©ăŒăĄăă»ăŒăžăçĄèŠăăăăăź`--max-error`ăă©ă°ăăăăŸăă
-
-
-
-
-
-
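The mismatch-debugging workflow the deleted guide describes boils down to comparing hidden states between the two frameworks; a hedged sketch using BERT as a stand-in (assumes torch, TensorFlow, and numpy are installed):

```python
import numpy as np
import torch
from transformers import AutoTokenizer, BertModel, TFBertModel

checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pt_model = BertModel.from_pretrained(checkpoint)
tf_model = TFBertModel.from_pretrained(checkpoint, from_pt=True)

text = "Debugging cross-framework mismatches"
with torch.no_grad():
    pt_hidden = pt_model(**tokenizer(text, return_tensors="pt")).last_hidden_state.numpy()
tf_hidden = tf_model(**tokenizer(text, return_tensors="tf")).last_hidden_state.numpy()

# The guide aims for agreement below 1e-5 at every stage; when this is larger,
# drop breakpoint() calls into both modeling files and bisect top-down.
print(np.max(np.abs(pt_hidden - tf_hidden)))
```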
diff --git a/docs/source/ja/tasks/asr.md b/docs/source/ja/tasks/asr.md
index 6d5f65461d215b..9226f5b414fdfd 100644
--- a/docs/source/ja/tasks/asr.md
+++ b/docs/source/ja/tasks/asr.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/automatic-speech-recognition) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/audio_classification.md b/docs/source/ja/tasks/audio_classification.md
index 6f4d0dd171846a..d32050072f962e 100644
--- a/docs/source/ja/tasks/audio_classification.md
+++ b/docs/source/ja/tasks/audio_classification.md
@@ -29,18 +29,11 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/audio-classification) ăçąșèȘăăăăšăăć§ăăăŸăă
-ć§ăăćă«ăćż
èŠăȘă©ă€ăă©ăȘăăăčăŠă€ăłăčăăŒă«ăăăŠăăăăšăçąșèȘăăŠăă ăăă
-
```bash
pip install transformers datasets evaluate
```
diff --git a/docs/source/ja/tasks/document_question_answering.md b/docs/source/ja/tasks/document_question_answering.md
index ec88f262086cf5..847ec8441ccf76 100644
--- a/docs/source/ja/tasks/document_question_answering.md
+++ b/docs/source/ja/tasks/document_question_answering.md
@@ -30,14 +30,7 @@ rendered properly in your Markdown viewer.
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-
-
-[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/document-question-answering) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md
index f16e46c26fc316..2202dc3a4f6498 100644
--- a/docs/source/ja/tasks/image_classification.md
+++ b/docs/source/ja/tasks/image_classification.md
@@ -31,13 +31,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/image-classification) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/language_modeling.md b/docs/source/ja/tasks/language_modeling.md
index 1d1bcab0b3757a..b65d60102ef1ca 100644
--- a/docs/source/ja/tasks/language_modeling.md
+++ b/docs/source/ja/tasks/language_modeling.md
@@ -37,14 +37,7 @@ rendered properly in your Markdown viewer.
-ăăźăŹă€ăăšćăæé ă«ćŸăŁăŠăć æèšèȘăąăăȘăłă°çšă«ä»ăźăąăŒăăăŻăăŁăćŸźèȘżæŽă§ăăŸăă
-æŹĄăźăąăŒăăăŻăăŁăźăăăăăéžæăăŸăă
-
-
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
-
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/text-generation) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/masked_language_modeling.md b/docs/source/ja/tasks/masked_language_modeling.md
index 29488d5c71e44e..29d7b73ae5d026 100644
--- a/docs/source/ja/tasks/masked_language_modeling.md
+++ b/docs/source/ja/tasks/masked_language_modeling.md
@@ -30,14 +30,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăŹă€ăăšćăæé ă«ćŸăŁăŠăăăčăŻăăăèšèȘăąăăȘăłă°çšă«ä»ăźăąăŒăăăŻăăŁăćŸźèȘżæŽă§ăăŸăă
-æŹĄăźăąăŒăăăŻăăŁăźăăăăăéžæăăŸăă
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/fill-mask) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/monocular_depth_estimation.md b/docs/source/ja/tasks/monocular_depth_estimation.md
index 984631fd3d5500..e7a3a994a60ebc 100644
--- a/docs/source/ja/tasks/monocular_depth_estimation.md
+++ b/docs/source/ja/tasks/monocular_depth_estimation.md
@@ -26,13 +26,8 @@ rendered properly in your Markdown viewer.
ăȘăŻă«ăŒăžă§ăłăšăăŻăčăăŁă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[DPT](../model_doc/dpt), [GLPN](../model_doc/glpn)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/depth-estimation) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/multiple_choice.md b/docs/source/ja/tasks/multiple_choice.md
index 045c9112932dba..98e258f161b712 100644
--- a/docs/source/ja/tasks/multiple_choice.md
+++ b/docs/source/ja/tasks/multiple_choice.md
@@ -25,17 +25,6 @@ rendered properly in your Markdown viewer.
1. [SWAG](https://huggingface.co/datasets/swag) ăăŒăżă»ăăăźăéćžžăæ§æ㧠[BERT](https://huggingface.co/google-bert/bert-base-uncased) ăćŸźèȘżæŽăăŠăæé©ăȘăăŒăżă»ăăăéžæăăŸăè€æ°ăźéžæèąăšäœăăăźăłăłăăăčăăèæ
źăăŠćçăăŸăă
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
-
-
ć§ăăćă«ăćż
èŠăȘă©ă€ăă©ăȘăăăčăŠă€ăłăčăăŒă«ăăăŠăăăăšăçąșèȘăăŠăă ăăă
```bash
diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md
index 389e7bdf2f455e..1b1bfb3f8158a4 100644
--- a/docs/source/ja/tasks/object_detection.md
+++ b/docs/source/ja/tasks/object_detection.md
@@ -33,13 +33,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/object-detection) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/question_answering.md b/docs/source/ja/tasks/question_answering.md
index d7feac56076ffa..b039272f45e80a 100644
--- a/docs/source/ja/tasks/question_answering.md
+++ b/docs/source/ja/tasks/question_answering.md
@@ -31,15 +31,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/question-answering) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md
index 572280c1962ede..56fb47d52f7e37 100644
--- a/docs/source/ja/tasks/semantic_segmentation.md
+++ b/docs/source/ja/tasks/semantic_segmentation.md
@@ -29,13 +29,7 @@ rendered properly in your Markdown viewer.
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-
-[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/image-segmentation) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/sequence_classification.md b/docs/source/ja/tasks/sequence_classification.md
index c97644ca10fad6..4c2a70ab8a303d 100644
--- a/docs/source/ja/tasks/sequence_classification.md
+++ b/docs/source/ja/tasks/sequence_classification.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/text-classification) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md
index 04f1a53d13f2c6..a4385f73792fc9 100644
--- a/docs/source/ja/tasks/summarization.md
+++ b/docs/source/ja/tasks/summarization.md
@@ -31,13 +31,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/summarization) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/token_classification.md b/docs/source/ja/tasks/token_classification.md
index 497584674252ad..a7f5097f685918 100644
--- a/docs/source/ja/tasks/token_classification.md
+++ b/docs/source/ja/tasks/token_classification.md
@@ -28,12 +28,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [BROS](../model_doc/bros), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/token-classification) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md
index b68cddd86e5abe..f683581cd1116c 100644
--- a/docs/source/ja/tasks/translation.md
+++ b/docs/source/ja/tasks/translation.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/translation) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md
index 688cb701496f79..ecfae843f2ae37 100644
--- a/docs/source/ja/tasks/video_classification.md
+++ b/docs/source/ja/tasks/video_classification.md
@@ -27,13 +27,8 @@ rendered properly in your Markdown viewer.
2. ćŸźèȘżæŽăăăąăă«ăæšè«ă«äœżçšăăŸăă
-ăăźăă„ăŒăăȘăąă«ă§èȘŹæăăăżăčăŻăŻăæŹĄăźăąăă« ăąăŒăăăŻăăŁă§ă”ăăŒăăăăŠăăŸăă
-
-
-[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae), [ViViT](../model_doc/vivit)
-
-
+ăăźăżăčăŻăšäșææ§ăźăăăăčăŠăźăąăŒăăăŻăăŁăšăă§ăăŻăă€ăłăăçąșèȘăăă«ăŻă[ăżăčăŻăăŒăž](https://huggingface.co/tasks/video-classification) ăçąșèȘăăăăšăăć§ăăăŸăă
diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml
index 86d9dc112a3d94..6b4a3001f2d83e 100644
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@@ -185,8 +185,6 @@
title: đ€ Transformersì êž°ìŹíë ë°©ëČ
- local: add_new_model
title: đ€ Transformersì ìëĄìŽ ëȘšëžì ì¶ê°íë ë°©ëČ
- - local: add_tensorflow_model
- title: ìŽë»êČ đ€ Transformers ëȘšëžì TensorFlowëĄ ëłííëì?
- local: add_new_pipeline
title: ìŽë»êČ đ€ Transformersì íìŽíëŒìžì ì¶ê°íëì?
- local: testing
diff --git a/docs/source/ko/add_new_model.md b/docs/source/ko/add_new_model.md
index 752bbd4e4e3aae..d5834777d31eef 100644
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@@ -17,12 +17,6 @@ rendered properly in your Markdown viewer.
Hugging Face Transformers ëŒìŽëžëŹëŠŹë ì»€ëź€ëí° êž°ìŹìë€ ëë¶ì ìëĄìŽ ëȘšëžì ì êł”í ì ìë êČœì°ê° ë§ì”ëë€. íì§ë§ ìŽë ëì ì ìž íëĄì ížìŽë©° Hugging Face Transformers ëŒìŽëžëŹëŠŹì ê”Źíí ëȘšëžì ëí êčì ìŽíŽê° íìí©ëë€. Hugging Faceììë ë ë§ì ì»€ëź€ëí° ë©€ëČê° ëȘšëžì ì ê·čì ìŒëĄ ì¶ê°í ì ìëëĄ ì§ìíêł ì íë©°, ìŽ ê°ìŽëë„Œ í”íŽ PyTorch ëȘšëžì ì¶ê°íë êłŒì ì ìëŽíêł ìì”ëë€ (PyTorchê° ì€ìčëìŽ ìëì§ íìžíŽìŁŒìžì).
-
-
-TensorFlow ëȘšëžì ê”Źííêł ì íë êČœì° [đ€ Transformers ëȘšëžì TensorFlowëĄ ëłííë ë°©ëČ](add_tensorflow_model) ê°ìŽëë„Œ ìŽíŽëłŽìžì!
-
-
-
ìŽ êłŒì ì ì§ííë©Ž ë€ìêłŒ ê°ì ëŽì©ì ìŽíŽíêČ ë©ëë€:
- ì€í ìì€ì ëȘšëČ ìŹëĄì ëí í”ì°°ë „ì ì»ì”ëë€.
@@ -274,12 +268,14 @@ cd transformers
ë€ìêłŒ ê°ìŽ ìŽëŻž ìĄŽìŹíë ëȘšëžì ëȘšëž ìí€í
ìČì ì íí ìŒìčíë ëȘšëžì ì¶ê°íë íčëłí êČœì°ìë [ìŽ ìčì
](#write-a-conversion-script)ì ì€ëȘ
ëëëĄ ëłí ì€íŹëŠœížë§ ì¶ê°íë©Ž ë©ëë€. ìŽ êČœì°ìë ìŽëŻž ìĄŽìŹíë ëȘšëžì ì ìČŽ ëȘšëž ìí€í
ìČë„Œ ê·žëëĄ ìŹìŹì©í  ì ìì”ëë€.
-ê·žë ì§ ììŒë©Ž ìëĄìŽ ëȘšëž ìì±ì ììí©ìë€. ìŹêž°ìì ë ê°ì§ ì íì§ê° ìì”ëë€:
+ê·žë ì§ ììŒë©Ž ì  ëȘšëž ìì±ì ììíêČ ì”ëë€. ë€ì ì€íŹëŠœížë„Œ ìŹì©íìŹ êž°ìĄŽ ëȘšëžìì ììíë ëȘšëžì ì¶ê°íë êČìŽ ìąì”ëë€:
-- `transformers-cli add-new-model-like`ë„Œ ìŹì©íìŹ êž°ìĄŽ ëȘšëžêłŒ ì ìŹí ìëĄìŽ ëȘšëž ì¶ê°íêž°
-- `transformers-cli add-new-model`ì ìŹì©íìŹ í
í늿ì êž°ë°ìŒëĄ í ìëĄìŽ ëȘšëž ì¶ê°íêž° (ì íí ëȘšëž ì íì ë°ëŒ BERT ëë Bartì ì ìŹí ëȘšì”ìŒ êČì
ëë€)
+```bash
+transformers-cli add-new-model-like
+```
-ë êČœì° ëȘšë, ëȘšëžì êž°ëłž ì ëłŽë„Œ ì
ë „íë ì€ëŹžìĄ°ìŹê° ì ìë©ëë€. ë ëČ짞 ëȘ
ë čìŽë `cookiecutter`ë„Œ ì€ìčíŽìŒ í©ëë€. ììží ì 볎ë [ìŹêž°](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model)ìì íìží  ì ìì”ëë€.
+ëȘšëžì êž°ëłž ì ëłŽë„Œ ì
ë „íë ì€ëŹžì§ê° íìë©ëë€.
**huggingface/transformers ë©ìž ì ì„ìì Pull Request ìŽêž°**
diff --git a/docs/source/ko/add_tensorflow_model.md b/docs/source/ko/add_tensorflow_model.md
deleted file mode 100644
index 22980b1320c55b..00000000000000
--- a/docs/source/ko/add_tensorflow_model.md
+++ /dev/null
@@ -1,262 +0,0 @@
-
-
-# ìŽë»êČ đ€ Transformers ëȘšëžì TensorFlowëĄ ëłííëì? [[how-to-convert-a-transformers-model-to-tensorflow]]
-
-đ€ TransformersìììČëŒ ìŹì©í  ì ìë ìŹëŹ ê°ì§ íë ììíŹê° ìë€ë êČì ì í늏ìŒìŽì
ì ì€êłí  ë ê·žë€ì ê°ì ì ì ì°íêČ ìŽì©í  ì ìë€ë ì„ì ìŽ ìì§ë§, ëȘšëž ëłëĄ ížíì±ì ì¶ê°íŽìŒ íë€ë ëšì  ëí ìĄŽìŹíë€ë êČì ì믞í©ëë€. ìąì ììì êž°ìĄŽ ëȘšëžì TensorFlow ížíì±ì ì¶ê°íë êČìŽ [ìČìë¶í° ìëĄìŽ ëȘšëžì ì¶ê°íë êČ](add_new_model)볎ë€ë ê°ëšíë€ë êČì
ëë€!
-
-ë§ìœ ëê·ëȘš TensorFlow ëȘšëžì ë êčìŽ ìŽíŽíë €ê±°ë, ì€í ìì€ì í° êž°ìŹë„Œ íë €ê±°ë, ì íí ëȘšëžì Tensorflowë„Œ íì©íë €íë€ë©Ž, ìŽ ìëŽìë ìŹëŹë¶ê» ëììŽ ë  êČì
ëë€.
-
-ìŽ ê°ìŽëë Hugging Face íì ì”ìíì ê°ë
ìëìì đ€ Transformersìì ìŹì©ëë TensorFlow ëȘšëž ê°ì€ìčì/ëë ìí€í
ìČë„Œ êž°ìŹí  ì ìë ì»€ëź€ëí° ê”Źì±ììž ìŹëŹë¶ì ëììŒëĄ í©ëë€.
-ìëĄìŽ ëȘšëžì ìì±íë êČì ìŹìŽ ìŒìŽ ìëì§ë§, ìŽ ê°ìŽëë„Œ í”íŽ ìĄ°êž ë  íë€êł  íšìŹ ìŹìŽ ìì
ìŒëĄ ë§ë€ ì ìì”ëë€.
-ëȘšëì êČœíì ëȘšìŒë êČì ìŽ ìì
ì ì ì°šì ìŒëĄ ë ìœêČ ë§ëë ë° ê”ì„í ì€ìíêž° ë돞ì, ìŽ ê°ìŽëë„Œ ê°ì ìíŹë§í ì ììŽ ë ì€ë„Žë©Ž êł”ì íìë걞 ì ê·čì ìŒëĄ ê¶ì„í©ëë€!
-
-ë êčìŽ ììëłŽêž° ì ì, đ€ Transformersë„Œ ìČì ì íë êČœì° ë€ì ìëŁë„Œ íìžíë êČìŽ ìąì”ëë€:
-- [đ€ Transformersì ìŒë° ê°ì](add_new_model#general-overview-of-transformers)
-- [Hugging Faceì TensorFlow ìČ í](https://huggingface.co/blog/tensorflow-philosophy)
-
-ìŽ ê°ìŽëì ëëšžì§ ë¶ë¶ììë ìëĄìŽ TensorFlow ëȘšëž ìí€í
ìČë„Œ ì¶ê°íë ë° íìí ëšêł, Pytorchë„Œ TensorFlow ëȘšëž ê°ì€ìčëĄ ëłííë ì ì°š ë° ML íë ììíŹ ê°ì ë¶ìŒìčë„Œ íšìšì ìŒëĄ ëëČêč
íë ë°©ëČì ìêČ ë  êČì
ëë€. ììíŽëŽ
ìë€!
-
-
-
-ìŹì©íë €ë ëȘšëžìŽ ìŽëŻž íŽëčíë TensorFlow ìí€í
ìČê° ìëì§ íì€íì§ ìëì?
-
-ì íí ëȘšëž([ì](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14))ì `config.json`ì `model_type` íëë„Œ íìžíŽëłŽìžì. đ€ Transformersì íŽëč ëȘšëž íŽëìë "modeling_tf"ëĄ ììíë íìŒìŽ ìë êČœì°, íŽëč ëȘšëžìë íŽëč TensorFlow ìí€í
ìČ([ì](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))ê° ìë€ë ì믞ì
ëë€.
-
-
-
-## TensorFlow ëȘšëž ìí€í
ìČ ìœë ì¶ê°íë ëšêłëł ê°ìŽë [[step-by-step-guide-to add-tensorFlow-model-architecture-code]]
-
-ëê·ëȘš ìí€í
ìČë„Œ ê°ì§ ëȘšëžì ì€êłíë ë°©ëČìë ìŹëŹê°ì§ê° ììŒë©°, íŽëč ì€êłë„Œ ê”Źííë ë°©ëČë ìŹëŹ ê°ì§ì
ëë€.
-ê·žëŹë ì°ëŠŹë [đ€ Transformers ìŒë° ê°ì](add_new_model#general-overview-of-transformers)ìì ìžêží ëëĄ ìŒêŽë ì€êł ì íì ë°ëŒìŒì§ë§ đ€ Transformersë„Œ ìŹì©íêž° íží êČìŽëŒë íêł í ìêČŹì ê°ì§êł ìì”ëë€.
-ì°ëŠŹì êČœíì í”íŽ TensorFlow ëȘšëžì ì¶ê°íë ë° êŽë šë ì€ìí ëȘ ê°ì§ ìŹíì ìë € ë늎 ì ìì”ëë€:
-
-- ìŽëŻž ìë걞 ë€ì ê°ë°íë € íì§ ë§ìžì! ì”ìí 2ê°ì ìŽëŻž ê”Źíë ëȘšëžì ëê° ì°žìĄ°íŽìŒ í©ëë€. ê”Źííë €ë ëȘšëžêłŒ êž°ë„ì ëìŒí Pytorch ëȘšëž íëì ê°ì 돞ì ì íì íêł ìë ë€ë„ž TensorFlow ëȘšëž íëë„Œ ìŽíŽëłŽìžì.
-- ì°ìí ëȘšëž ê”Źíì ìê°ìŽ ì§ëë ëšììì”ëë€. ìŽêČì ìœëê° ìëŠë€ëë€ë ìŽì ê° ìëëŒ ìœëê° ëȘ
ííêł  ëëČêč
 ë° ê°ì ìŽ ìœêž° ë돞ì
ëë€. TensorFlow ê”Źíìì ë€ë„ž ëȘšëžë€êłŒ íšíŽì ëê°ìŽ íêł  Pytorch ê”ŹíêłŒì ë¶ìŒìčë„Œ ì”ìííìŹ ë©ìží
ìŽëì ì
ëŹŽë„Œ ìœêČ íë€ë©Ž, êž°ìŹí ìœëê° ì€ëëëĄ ì ì§ë  ì ìì”ëë€.
-- íìíë€ë©Ž ëìì ììČíìžì! đ€ Transformers íì ìŹëŹë¶ì ëêž° ìíŽ ììŒë©°, ìŹëŹë¶ìŽ ì§ë©Ží ëìŒí 돞ì ì ëí íŽêČ°ì±
ì ìŽëŻž ì°Ÿì êČœì°ë ìì ì ìì”ëë€.
-
-TensorFlow ëȘšëž ìí€í
ìČë„Œ ì¶ê°íë ë° íìí ëšêłë„Œ ê°ë”ì ìŒëĄ ìšëłŽë©Ž:
-1. ëłííë €ë ëȘšëž ì í
-2. transformers ê°ë° íêČœ ì€ëč
-3. (ì í ìŹí) ìŽëĄ ì  ìžĄë©Ž ë° êž°ìĄŽ ê”Źí ìŽíŽ
-4. ëȘšëž ìí€í
ìČ ê”Źí
-5. ëȘšëž í
ì€íž ê”Źí
-6. PR (pull request) ì ì¶
-7. (ì í ìŹí) ë°ëȘš ëčë ë° êł”ì 
-
-### 1.-3. ëȘšëž êž°ìŹ ì€ëč [[1.-3.-prepare-your-model-contribution]]
-
-**1. ëłííë €ë ëȘšëž ì í**
-
-ì°ì êž°ëłž ìŹíë¶í° ììíŽ ëłŽêČ ì”ëë€. 뚌ì ëłííë €ë ìí€í
ìČë„Œ ìììŒ í©ëë€.
-íčì  ìí€í
ìČì ëí êŽìŹ ìë êČœì°, đ€ Transformers íìêČ ì ìì ììČíë êČì ìŹëŹë¶ì ìí„ë „ì ê·čëííë ìąì ë°©ëČì
ëë€.
-ì°ëŠŹë TensorFlowìì ëč ì ž ìë ê°ì„ ì ëȘ
í ìí€í
ìČëĄ ìŽëìŽ ë늏êČ ì”ëë€.
-TensorFlowìì ìŹì©í  ëȘšëžìŽ ìŽëŻž đ€ Transformersì TensorFlow ìí€í
ìČ ê”ŹíìŽ ìì§ë§ ê°ì€ìčê° ìë êČœì°,
-ìŽ íìŽì§ì [ê°ì€ìč ì¶ê° ìčì
](#adding-tensorflow-weights-to-hub)ìŒëĄ ë°ëĄ ìŽëíì
ë ë©ëë€.
-
-ê°ëší ë§íŽì, ìŽ ìëŽìì ëëšžì§ ë¶ë¶ì TensorFlow ëČì ì *BrandNewBert*([ê°ìŽë](add_new_model)ì ëìŒí ìì )ë„Œ êž°ìŹíë €êł êČ°ì íë€êł ê°ì í©ëë€.
-
-
-
-TensorFlow ëȘšëž ìí€í
ìČì ìì
ì ììíêž° ì ì íŽëč ìì
ìŽ ì§í ì€ìžì§ íìžíìžì.
-`BrandNewBert`ë„Œ êČìíìŹ
-[pull request GitHub íìŽì§](https://github.com/huggingface/transformers/pulls?q=is%3Apr)ìì TensorFlow êŽë š pull requestê° ìëì§ íìží ì ìì”ëë€.
-
-
-
-**2. transformers ê°ë° íêČœ ì€ëč**
-
-
-ëȘšëž ìí€í
ìČë„Œ ì íí í, êŽë š ìì
ì ìíí  ìëë„Œ 믞늏 ìëŠŹêž° ìíŽ Draft PRì ìŹìžì. ìë ì§ìčšëëĄ íìë©Ž íêČœì ì€ì íêł  Draft PRì ìŽ ì ìì”ëë€.
-
-1. 'Fork' ëČíŒì íŽëŠíìŹ [늏íŹì§í°ëŠŹ](https://github.com/huggingface/transformers)ë„Œ íŹíŹíìžì. ìŽë êČ íë©Ž GitHub ìŹì©ì êłì ì ìœëì ìŹëłžìŽ ìì±ë©ëë€.
-
-
-2. `transformers` íŹíŹë„Œ ëĄì»Ź ëì€íŹì íŽëĄ íêł ìëłž 늏íŹì§í°ëŠŹë„Œ ìêČ© 늏íŹì§í°ëŠŹëĄ ì¶ê°íìžì.
-
-```bash
-git clone https://github.com/[your Github handle]/transformers.git
-cd transformers
-git remote add upstream https://github.com/huggingface/transformers.git
-```
-
-3. ê°ë° íêČœì ì€ì íìžì. ìë„Œ ë€ìŽ, ë€ì ëȘ
ë čì ì€ííìŹ ê°ë° íêČœì ì€ì í  ì ìì”ëë€.
-
-```bash
-python -m venv .env
-source .env/bin/activate
-pip install -e ".[dev]"
-```
-
-ìŽì ìČŽì ì ë°ëŒì Transformersì ì íì  ìą
ìì±ìŽ ìŠê°íë©Žì ì ëȘ
ë čìŽ ì€íší  ìë ìì”ëë€. ê·žë° êČœì° TensorFlowë„Œ ì€ìčí í ë€ìì ì€ííìžì.
-
-```bash
-pip install -e ".[quality]"
-```
-
-**ì°žêł :** CUDAë„Œ ì€ìčí íìë ìì”ëë€. ìëĄìŽ ëȘšëžìŽ CPUìì ìëíëëĄ ë§ëë êČë§ìŒëĄ 충ë¶í©ëë€.
-
-4. ë©ìž ëžëìčìì ë§ëë €ë êž°ë„ìŽ ì ííëë ìŽëŠìŒëĄ ëžëìčë„Œ ë§ëëë€.
-
-```bash
-git checkout -b add_tf_brand_new_bert
-```
-
-5. ë©ìž ëžëìčì íìŹ ìíë„Œ íìč(fetch)íêł ëŠŹëČ ìŽì€íìžì.
-
-```bash
-git fetch upstream
-git rebase upstream/main
-```
-
-6. `transformers/src/models/brandnewbert/`ì `modeling_tf_brandnewbert.py`ëŒë ëč `.py` íìŒì ì¶ê°íìžì. ìŽ íìŒìŽ TensorFlow ëȘšëž íìŒìŽ ë  êČì
ëë€.
-
-7. ëłêČœ ìŹíì êłì ì ížìíìžì.
-
-```bash
-git add .
-git commit -m "initial commit"
-git push -u origin add_tf_brand_new_bert
-```
-
-8. ë§ìĄ±ì€ëŹìŽ êČœì° GitHubìì íŹíŹë ìč íìŽì§ëĄ ìŽëí©ëë€. "Pull request"ë„Œ íŽëŠí©ëë€. Hugging Face íì GitHub IDë„Œ 늏뷰ìŽëĄ ì¶ê°íŽì, ììŒëĄì ëłêČœ ìŹíì ëíŽ Hugging Face íìŽ ì늌ì ë°ì ì ìëëĄ í©ëë€.
-
-
-9. GitHub Pull Requests íìŽì§ì ì€ë„žìȘœì ìë "Convert to draft"ë„Œ íŽëŠíìŹ PRì ìŽììŒëĄ ëłêČœíìžì.
-
-ìŽì đ€ Transformersìì *BrandNewBert*ë„Œ TensorFlowëĄ ëłíí ê°ë° íêČœì ì€ì íì”ëë€.
-
-
-**3. (ì í ìŹí) ìŽëĄ ì ìžĄë©Ž ë° êž°ìĄŽ ê”Źí ìŽíŽ**
-
-
-*BrandNewBert*ìČëŒ ììží êžìŽ ìë€ë©Ž ìê°ì ëŽìŽ ë
ŒëŹžì ìœë걞 ì¶ìČë늜ëë€. ìŽíŽíêž° ìŽë €ìŽ ë¶ë¶ìŽ ë§ì ì ìì”ëë€. ê·žë ë€êł  íŽì ê±±ì íì§ ë§ìžì! ëȘ©íë ë
ŒëŹžì ìŹëìë ìŽëĄ ì  ìŽíŽê° ìëëŒ TensorFlowë„Œ ìŹì©íìŹ đ€ Transformersì ëȘšëžì íšêłŒì ìŒëĄ ë€ì ê”Źííë ë° íìí íì ì ëłŽë„Œ ì¶ì¶íë êČì
ëë€. ë§ì ìê°ì ìŽëĄ ì  ìŽíŽì íŹìí  íìë ìì§ë§ ì€ì©ì ìž ìžĄë©Žìì íìŹ ìĄŽìŹíë ëȘšëž 돞ì íìŽì§(e.g. [model docs for BERT](model_doc/bert))ì ì§ì€íë êČìŽ ìąì”ëë€.
-
-
-ëȘšëžì êž°ëłž ìŹíì ìŽíŽí í, êž°ìĄŽ ê”Źíì ìŽíŽíë êČìŽ ì€ìí©ëë€. ìŽë ìì
 ì€ìž ëȘšëžì ëí ì€ì  ê”ŹíìŽ ìŹëŹë¶ì êž°ëì ìŒìčíšì íìžíêł , TensorFlow ìžĄë©Žììì êž°ì ì  ëŹžì ë„Œ ììí  ì ìì”ëë€.
-
-ë§ëí ìì ì ëłŽë„Œ ìČììŒëĄ íì”í  ë ìëëčíë êČì ìì°ì€ëŹìŽ ìŒì
ëë€. ìŽ ëšêłìì ëȘšëžì ëȘšë  ìžĄë©Žì ìŽíŽíŽìŒ íë íìë ì í ìì”ëë€. ê·žëŹë ì°ëŠŹë Hugging Faceì [íŹëŒ](https://discuss.huggingface.co/)ì í”íŽ ì§ëŹžìŽ ìë êČœì° ëë”ì ê”Źí êČì ê¶ì„í©ëë€.
-
-### 4. Model implementation [[4-model-implementation]]
-
-
-It's finally time to start coding. Our suggested starting point is the PyTorch file itself: copy the contents of `modeling_brand_new_bert.py`
-inside `src/transformers/models/brand_new_bert/`
-into `modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of đ€ Transformers such that you can import `TFBrandNewBert` and `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model.
-
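-To make that target state concrete, here is a minimal sketch of the end goal (the `TFBrandNewBert` class and `model_repo` are this guide's placeholders, not a real checkpoint):
-
-```python
-from transformers import TFBrandNewBert  # hypothetical class added in this guide
-
-# `from_pt=True` converts the existing PyTorch weights on the fly,
-# since no TensorFlow weights exist yet at this stage.
-model = TFBrandNewBert.from_pretrained("model_repo", from_pt=True)
-```
-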
-Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of tips to make the process as smooth as possible:
-
-- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`).
-- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to `tf.keras.layers.Dense`, and `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`. If you're not sure about a specific operation, you can check the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) or the [PyTorch documentation](https://pytorch.org/docs/stable/).
-- Look for patterns in the đ€ Transformers codebase. If you come across an operation that doesn't have a direct replacement, the odds are that someone else has already solved the same problem.
-- By default, keep the same variable names and structure as in PyTorch. This makes it easier to debug, track down issues, and add fixes later on.
-- Some layers have different default values in each framework. A notable example is the batch normalization layer's epsilon, which is `1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization). Double-check the documentation!
-- PyTorch's `nn.Parameter` variables typically need to be initialized within a TF layer's `build()`. See the following examples: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220)
-- If the PyTorch model has a `#copied from ...` comment on top of a function, the odds are that your TensorFlow model can also borrow that function from the architecture it was copied from, assuming that architecture has a TensorFlow version.
-- Assigning the `name` attribute correctly in TensorFlow functions is critical for the `from_pt=True` weight cross-loading to work. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not set properly, you will see it in the error message when loading the model weights (see also the sketch after this list).
-- The logic of the base model class, `BrandNewBertModel`, will actually live in `TFBrandNewBertMainLayer`, a Keras layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). `TFBrandNewBertModel` will simply be a wrapper around this layer.
-- Keras models need to be built before they can load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel` needs to hold an example of inputs to the model, the `dummy_inputs` ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)).
-- If you get stuck, ask for help. We're here to help you! đ€
-
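-To make the naming and layer-mapping tips above concrete, here is a minimal, hypothetical sketch of how a small PyTorch block could be mirrored in Keras (the `TFBrandNewBertSelfOutput` name follows this guide's placeholder model and is not real library code):
-
-```python
-import tensorflow as tf
-
-
-class TFBrandNewBertSelfOutput(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        # `name` mirrors the attribute name in the PyTorch module so that
-        # `from_pt=True` weight cross-loading can find each variable.
-        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
-        # Pass the epsilon explicitly: TF and PyTorch defaults differ.
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        # `training` is propagated from the top-level class.
-        hidden_states = self.dropout(hidden_states, training=training)
-        return self.LayerNorm(hidden_states + input_tensor)
-```
-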
-In addition to the model file itself, you will also need to add pointers to the model classes and the related documentation pages. You can complete this part entirely by following the patterns in other PRs ([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual changes:
-
-- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py`
-- Add the *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
-- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py`
-- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py`
-- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md`
-- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md`
-- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.md`
-
-When you're happy with your implementation, run the following checklist to confirm that your model architecture is ready:
-
-1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is propagated all the way down from the top-level classes
-2. You have used `#copied from ...` whenever possible
-3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs`
-4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable`
-5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`
-6. You can call the TensorFlow model using the expected input format (see also the sketch below this list)
-
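-For items 3 and 4, a skeletal sketch of the expected decorator usage looks like this (the `BrandNewBert` names are placeholders; the decorators themselves are the real ones from `transformers.modeling_tf_utils`):
-
-```python
-import tensorflow as tf
-
-from transformers import PretrainedConfig
-from transformers.modeling_tf_utils import keras_serializable, unpack_inputs
-
-
-class BrandNewBertConfig(PretrainedConfig):  # placeholder config for the sketch
-    model_type = "brand_new_bert"
-
-
-@keras_serializable
-class TFBrandNewBertMainLayer(tf.keras.layers.Layer):
-    # `@keras_serializable` requires a `config_class` attribute
-    config_class = BrandNewBertConfig
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.config = config
-
-    @unpack_inputs
-    def call(self, input_ids=None, attention_mask=None, training=False):
-        # ... the actual transformer computation lives here ...
-        return input_ids
-```
-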
-### 5. Add model tests [[5-add-model-tests]]
-
-Hooray, you've implemented a TensorFlow model architecture! Now it's time to write tests for it, so you can confirm the model behaves as expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in `tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and then make the necessary TensorFlow replacements. For now, make sure all `.from_pretrained()` calls use the `from_pt=True` flag to load the existing PyTorch weights.
-
-Once you're done, it's time for the moment of truth: run the tests! đŹ
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. In other cases, a general test might not be directly applicable to your model, in which case we suggest an override at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if you're stuck.
-
-When all tests pass, congratulations, your model is nearly ready to be added to the đ€ Transformers library! đ
-
-
-For more details on how to add tests, see the [testing guide for đ€ Transformers](https://huggingface.co/transformers/contributing.html#running-tests).
-
-### 6.-7. Ensure everyone can use your model [[6.-7.-ensure-everyone-can-use-your-model]]
-
-**6. Submit the pull request**
-
-Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code, run our code formatting utility, `make fixup` đȘ. This will automatically fix minor formatting issues, which would otherwise cause our automatic checks to fail.
-
-It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model.
-
-After all reviewers are happy with the state of your PR, the final step is to remove the `from_pt=True` flag in the `.from_pretrained()` calls. Since there are no TensorFlow weights yet, you will have to add them! Check the instructions in the section below on how to do it.
-
-Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are green, double-check the tests locally one last time:
-
-```bash
-NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
-py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
-```
-
-and we will merge your PR! Congratulations on reaching the milestone! đ
-
-**7. (Optional) Build demos and share with the world**
-
-One of the hardest parts of open source is discoverability. How can other users learn about the existence of your fabulous TensorFlow contribution? With proper communication, of course! đŁ
-
-There are two main ways to share your model with the community:
-- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community).
-- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share your achievement with the community - your model can now be used by thousands of engineers and researchers around the world đ! We will be happy to retweet your posts and help you share your work with the community.
-
-
-## Adding TensorFlow weights to the đ€ Hub [[adding-tensorflow-weights-to-hub]]
-
-Assuming that the TensorFlow model architecture is available in đ€ Transformers, converting PyTorch weights into TensorFlow weights is a breeze!
-
-Here's how to do it:
-1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens)).
-2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository containing the PyTorch weights you want to convert (see the sketch after this list for a local alternative).
-3. Tag `@joaogante` and `@Rocketknight1` in the đ€ Hub PR the command above has just created.
-
-That's it! đ
-
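-If you prefer to create the TensorFlow weights locally before opening a Hub PR, a rough equivalent of step 2 is the standard `from_pt=True` loading path (`foo/bar` is the placeholder repository from the list above):
-
-```python
-from transformers import TFAutoModel
-
-# Build the TensorFlow model from the PyTorch checkpoint.
-tf_model = TFAutoModel.from_pretrained("foo/bar", from_pt=True)
-
-# Writes a TensorFlow weights file (tf_model.h5) you can upload to the Hub.
-tf_model.save_pretrained("converted_model")
-```
-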
-
-## Debugging mismatches across ML frameworks đ [[debugging-mismatches-across-ml-frameworks]]
-
-At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open the model architecture code for the two frameworks and find that they look identical. What's going on? đ€
-
-First of all, let's talk about why understanding these mismatches matters. Many community members will use đ€ Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch between the two frameworks, it implies that the model is not following the reference implementation for at least one of them. This might lead to silent failures, in which the model runs but performs poorly. That is arguably worse than a model that doesn't run at all! To that end, we aim at keeping the framework mismatch below `1e-5` at all stages of the model.
-
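-In practice, checking that target usually boils down to a comparison like the following sketch (using BERT, which has both sets of weights, purely as an illustration):
-
-```python
-import numpy as np
-import torch
-from transformers import AutoModel, AutoTokenizer, TFAutoModel
-
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-pt_model = AutoModel.from_pretrained("bert-base-uncased")
-tf_model = TFAutoModel.from_pretrained("bert-base-uncased")
-
-pt_inputs = tokenizer("Hello world", return_tensors="pt")
-tf_inputs = tokenizer("Hello world", return_tensors="tf")
-
-with torch.no_grad():
-    pt_hidden = pt_model(**pt_inputs).last_hidden_state.numpy()
-tf_hidden = tf_model(**tf_inputs).last_hidden_state.numpy()
-
-# Largest absolute difference between the two frameworks; aim for < 1e-5
-print(np.max(np.abs(pt_hidden - tf_hidden)))
-```
-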
-As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret ingredient here is patience. Here is our suggested workflow for when you come across this type of issue:
-1. Locate the source of the mismatch. The model you're converting probably has near-identical inner variables up to a certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the numerical variables in a top-down fashion until you find the source of the problem.
-2. Now that you've pinpointed the source of the issue, get in touch with the đ€ Transformers team. It is possible that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages like StackOverflow and GitHub issues.
-3. If there is no solution in sight, you'll have to go deeper. The good news is that you've located the issue, so you can focus on the problematic instruction and abstract away the rest of the model! The bad news is that you'll have to dig into the source implementation of that instruction. In some cases, you might find an issue in a reference implementation itself - don't refrain from opening an issue in the upstream repository.
-
-In some cases, after discussing it with the đ€ Transformers team, we might decide that fixing the mismatch is infeasible. When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` flag to override the error message during weight conversion.
diff --git a/docs/source/ko/contributing.md b/docs/source/ko/contributing.md
index 56e51b326644f2..f5003eff07c02e 100644
--- a/docs/source/ko/contributing.md
+++ b/docs/source/ko/contributing.md
@@ -99,7 +99,7 @@ python src/transformers/commands/transformers_cli.py env
If you want to contribute a model yourself, let us know. We will help you add it to đ€ Transformers!
-We provide a [detailed guide and templates](https://github.com/huggingface/transformers/tree/main/templates) on how to add a new model, and there is also a more technical guide on [how to add a new model to đ€ Transformers](https://huggingface.co/docs/transformers/add_new_model).
+There is also a more technical guide on [how to add a new model to đ€ Transformers](https://huggingface.co/docs/transformers/add_new_model).
## Do you want to add documentation? [[do-you-want-to-add-documentation]]
diff --git a/docs/source/ko/tasks/asr.md b/docs/source/ko/tasks/asr.md
index 474d60bf2d1a19..2247537678abea 100644
--- a/docs/source/ko/tasks/asr.md
+++ b/docs/source/ko/tasks/asr.md
@@ -29,13 +29,8 @@ Virtual assistants like Siri and Alexa use ASR models to help users every day,
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/automatic-speech-recognition).
diff --git a/docs/source/ko/tasks/audio_classification.md b/docs/source/ko/tasks/audio_classification.md
index c9ef810e8ef4f4..73932100b0cb3a 100644
--- a/docs/source/ko/tasks/audio_classification.md
+++ b/docs/source/ko/tasks/audio_classification.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/audio-classification).
diff --git a/docs/source/ko/tasks/document_question_answering.md b/docs/source/ko/tasks/document_question_answering.md
index 920eb99ea52960..3d943ab96e6765 100644
--- a/docs/source/ko/tasks/document_question_answering.md
+++ b/docs/source/ko/tasks/document_question_answering.md
@@ -29,13 +29,7 @@ rendered properly in your Markdown viewer.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/image-to-text).
diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md
index d647b4512b038a..91ff3a9ca9b848 100644
--- a/docs/source/ko/tasks/image_classification.md
+++ b/docs/source/ko/tasks/image_classification.md
@@ -30,12 +30,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/image-classification).
diff --git a/docs/source/ko/tasks/language_modeling.md b/docs/source/ko/tasks/language_modeling.md
index b98c64dcc3adae..ff2a47c24ece2a 100644
--- a/docs/source/ko/tasks/language_modeling.md
+++ b/docs/source/ko/tasks/language_modeling.md
@@ -33,14 +33,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference
-You can finetune other architectures for causal language modeling following the same steps in this guide.
-Choose one of the following architectures:
-
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/text-generation).
diff --git a/docs/source/ko/tasks/masked_language_modeling.md b/docs/source/ko/tasks/masked_language_modeling.md
index c710dbf168ed01..74df085c5b558f 100644
--- a/docs/source/ko/tasks/masked_language_modeling.md
+++ b/docs/source/ko/tasks/masked_language_modeling.md
@@ -30,15 +30,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model directly for inference.
-You can finetune other architectures for masked language modeling following the same steps in this guide.
-Choose one of the following architectures:
-
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/fill-mask).
diff --git a/docs/source/ko/tasks/monocular_depth_estimation.md b/docs/source/ko/tasks/monocular_depth_estimation.md
index e02dd5466b7d54..2c640d2a86db3d 100644
--- a/docs/source/ko/tasks/monocular_depth_estimation.md
+++ b/docs/source/ko/tasks/monocular_depth_estimation.md
@@ -24,13 +24,8 @@ rendered properly in your Markdown viewer.
-The task covered in this tutorial is supported by the following model architectures:
-
-
-[DPT](../model_doc/dpt), [GLPN](../model_doc/glpn)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/depth-estimation).
diff --git a/docs/source/ko/tasks/multiple_choice.md b/docs/source/ko/tasks/multiple_choice.md
index b28654ea4f1438..607bc047479ce1 100644
--- a/docs/source/ko/tasks/multiple_choice.md
+++ b/docs/source/ko/tasks/multiple_choice.md
@@ -25,17 +25,6 @@ rendered properly in your Markdown viewer.
1. Finetune [BERT](https://huggingface.co/google-bert/bert-base-uncased) on the 'regular' configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
2. Use your finetuned model for inference.
-
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
-
-
Before you begin, make sure you have all the necessary libraries installed:
```bash
diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md
index 0076bba6f8441f..2b92d7edb59ff7 100644
--- a/docs/source/ko/tasks/object_detection.md
+++ b/docs/source/ko/tasks/object_detection.md
@@ -30,13 +30,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/object-detection).
diff --git a/docs/source/ko/tasks/question_answering.md b/docs/source/ko/tasks/question_answering.md
index 7fe8ba3a5f08d0..cebd9e1a78a4b0 100644
--- a/docs/source/ko/tasks/question_answering.md
+++ b/docs/source/ko/tasks/question_answering.md
@@ -31,14 +31,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/question-answering).
diff --git a/docs/source/ko/tasks/semantic_segmentation.md b/docs/source/ko/tasks/semantic_segmentation.md
index 0afa4bbe020f7c..8a5e20228d608f 100644
--- a/docs/source/ko/tasks/semantic_segmentation.md
+++ b/docs/source/ko/tasks/semantic_segmentation.md
@@ -29,13 +29,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/image-segmentation).
diff --git a/docs/source/ko/tasks/sequence_classification.md b/docs/source/ko/tasks/sequence_classification.md
index 9cf6b9f52433a3..b9812e63b0631e 100644
--- a/docs/source/ko/tasks/sequence_classification.md
+++ b/docs/source/ko/tasks/sequence_classification.md
@@ -28,14 +28,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/text-classification).
diff --git a/docs/source/ko/tasks/summarization.md b/docs/source/ko/tasks/summarization.md
index 62e410757e464e..fc09d6a86e1fbf 100644
--- a/docs/source/ko/tasks/summarization.md
+++ b/docs/source/ko/tasks/summarization.md
@@ -33,13 +33,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/summarization).
diff --git a/docs/source/ko/tasks/token_classification.md b/docs/source/ko/tasks/token_classification.md
index 5bb3989d45944f..e32a18e1ee0a04 100644
--- a/docs/source/ko/tasks/token_classification.md
+++ b/docs/source/ko/tasks/token_classification.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/token-classification).
diff --git a/docs/source/ko/tasks/translation.md b/docs/source/ko/tasks/translation.md
index 982142c84ea4ef..b05ecf2d5a2cc9 100644
--- a/docs/source/ko/tasks/translation.md
+++ b/docs/source/ko/tasks/translation.md
@@ -28,13 +28,8 @@ rendered properly in your Markdown viewer.
2. Use your finetuned model for inference.
-This task guide can also be applied to the following model architectures:
-
-
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/translation).
diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md
index 762716c9ff7f8e..f18ef918fa956e 100644
--- a/docs/source/ko/tasks/video_classification.md
+++ b/docs/source/ko/tasks/video_classification.md
@@ -28,13 +28,7 @@ rendered properly in your Markdown viewer.
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/video-classification).
diff --git a/docs/source/ms/_toctree.yml b/docs/source/ms/_toctree.yml
index 0ec1ee59ad8914..d69f13511e1023 100644
--- a/docs/source/ms/_toctree.yml
+++ b/docs/source/ms/_toctree.yml
@@ -147,8 +147,6 @@
title: Bagaimana untuk menyumbang kepada transformer?
- local: add_new_model
title: Bagaimana untuk menambah model pada đ€ Transformers?
- - local: add_tensorflow_model
- title: Bagaimana untuk menukar model Transformers kepada TensorFlow?
- local: add_new_pipeline
title: Bagaimana untuk menambah saluran paip ke đ€ Transformers?
- local: testing
diff --git a/docs/source/zh/contributing.md b/docs/source/zh/contributing.md
index f430e8a85f16cd..9c247a60a148c8 100644
--- a/docs/source/zh/contributing.md
+++ b/docs/source/zh/contributing.md
@@ -98,7 +98,7 @@ python src/transformers/commands/transformers_cli.py env
If you want to contribute a model yourself, let us know. Let us help you add it to đ€ Transformers!
-We have added [detailed templates](https://github.com/huggingface/transformers/tree/main/templates) to help you add a new model. We also have a more technical guide on [how to add a model to đ€ Transformers](https://huggingface.co/docs/transformers/add_new_model).
+We also have a more technical guide on [how to add a model to đ€ Transformers](https://huggingface.co/docs/transformers/add_new_model).
## Do you want to add documentation?
diff --git a/docs/source/zh/tasks/asr.md b/docs/source/zh/tasks/asr.md
index 48ab94cb7d9503..b4366d720404ac 100644
--- a/docs/source/zh/tasks/asr.md
+++ b/docs/source/zh/tasks/asr.md
Virtual assistants like Siri and Alexa use ASR models to help users in daily life,
-The task illustrated in this tutorial is supported by the following model architectures:
-
-
-
-[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-BERT](../model_doc/wav2vec2-bert), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm)
-
-
+To see all architectures and checkpoints compatible with this task, we recommend checking the [task page](https://huggingface.co/tasks/automatic-speech-recognition).
diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt
index d832b76ec04bde..40373bd38a4da6 100644
--- a/examples/research_projects/decision_transformer/requirements.txt
+++ b/examples/research_projects/decision_transformer/requirements.txt
@@ -1,5 +1,5 @@
absl-py==1.0.0
-aiohttp==3.8.5
+aiohttp==3.9.0
aiosignal==1.2.0
alembic==1.7.7
appdirs==1.4.4
@@ -15,7 +15,7 @@ backcall==0.2.0
backoff==1.11.1
backports.zoneinfo==0.2.1
binaryornot==0.4.4
-black==22.1.0
+black==24.3.0
boto3==1.16.34
botocore==1.19.63
Brotli==1.0.9
@@ -119,7 +119,7 @@ nltk==3.7
numba==0.55.1
numpy==1.22.3
oauthlib==3.2.2
-onnx==1.13.0
+onnx>=1.15.0
onnxconverter-common==1.9.0
opt-einsum==3.3.0
optax==0.1.1
@@ -174,7 +174,7 @@ python-slugify==6.1.1
pytz==2022.1
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
-ray==1.11.0
+ray>2.6.3
redis==4.5.4
regex==2022.3.15
requests==2.31.0
@@ -205,7 +205,7 @@ tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorboardX==2.5
-tensorflow==2.8.1
+tensorflow==2.11.1
tensorflow-io-gcs-filesystem==0.24.0
termcolor==1.1.0
text-unidecode==1.3
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 3ce3e057a240c4..a65ed489d9506b 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -709,6 +709,7 @@
],
"models.persimmon": ["PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP", "PersimmonConfig"],
"models.phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"],
+ "models.phi3": ["PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP", "Phi3Config"],
"models.phobert": ["PhobertTokenizer"],
"models.pix2struct": [
"PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -3057,6 +3058,16 @@
"PhiPreTrainedModel",
]
)
+ _import_structure["models.phi3"].extend(
+ [
+ "PHI3_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "Phi3ForCausalLM",
+ "Phi3ForSequenceClassification",
+ "Phi3ForTokenClassification",
+ "Phi3Model",
+ "Phi3PreTrainedModel",
+ ]
+ )
_import_structure["models.pix2struct"].extend(
[
"PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -5669,6 +5680,7 @@
PersimmonConfig,
)
from .models.phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig
+ from .models.phi3 import PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP, Phi3Config
from .models.phobert import PhobertTokenizer
from .models.pix2struct import (
PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -7715,6 +7727,14 @@
PhiModel,
PhiPreTrainedModel,
)
+ from .models.phi3 import (
+ PHI3_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Phi3ForCausalLM,
+ Phi3ForSequenceClassification,
+ Phi3ForTokenClassification,
+ Phi3Model,
+ Phi3PreTrainedModel,
+ )
from .models.pix2struct import (
PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST,
Pix2StructForConditionalGeneration,
diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py
deleted file mode 100644
index 87949827d9f884..00000000000000
--- a/src/transformers/commands/add_new_model.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import shutil
-import warnings
-from argparse import ArgumentParser, Namespace
-from pathlib import Path
-from typing import List
-
-from ..utils import logging
-from . import BaseTransformersCLICommand
-
-
-try:
- from cookiecutter.main import cookiecutter
-
- _has_cookiecutter = True
-except ImportError:
- _has_cookiecutter = False
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-def add_new_model_command_factory(args: Namespace):
- return AddNewModelCommand(args.testing, args.testing_file, path=args.path)
-
-
-class AddNewModelCommand(BaseTransformersCLICommand):
- @staticmethod
- def register_subcommand(parser: ArgumentParser):
- add_new_model_parser = parser.add_parser("add-new-model")
- add_new_model_parser.add_argument("--testing", action="store_true", help="If in testing mode.")
- add_new_model_parser.add_argument("--testing_file", type=str, help="Configuration file on which to run.")
- add_new_model_parser.add_argument(
- "--path", type=str, help="Path to cookiecutter. Should only be used for testing purposes."
- )
- add_new_model_parser.set_defaults(func=add_new_model_command_factory)
-
- def __init__(self, testing: bool, testing_file: str, path=None, *args):
- self._testing = testing
- self._testing_file = testing_file
- self._path = path
-
- def run(self):
- warnings.warn(
- "The command `transformers-cli add-new-model` is deprecated and will be removed in v5 of Transformers. "
- "It is not actively maintained anymore, so might give a result that won't pass all tests and quality "
- "checks, you should use `transformers-cli add-new-model-like` instead."
- )
- if not _has_cookiecutter:
- raise ImportError(
- "Model creation dependencies are required to use the `add_new_model` command. Install them by running "
- "the following at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
- )
- # Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
- directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
- if len(directories) > 0:
- raise ValueError(
- "Several directories starting with `cookiecutter-template-` in current working directory. "
- "Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
- "change your working directory."
- )
-
- path_to_transformer_root = (
- Path(__file__).parent.parent.parent.parent if self._path is None else Path(self._path).parent.parent
- )
- path_to_cookiecutter = path_to_transformer_root / "templates" / "adding_a_new_model"
-
- # Execute cookiecutter
- if not self._testing:
- cookiecutter(str(path_to_cookiecutter))
- else:
- with open(self._testing_file, "r") as configuration_file:
- testing_configuration = json.load(configuration_file)
-
- cookiecutter(
- str(path_to_cookiecutter if self._path is None else self._path),
- no_input=True,
- extra_context=testing_configuration,
- )
-
- directory = [directory for directory in os.listdir() if "cookiecutter-template-" in directory[:22]][0]
-
- # Retrieve configuration
- with open(directory + "/configuration.json", "r") as configuration_file:
- configuration = json.load(configuration_file)
-
- lowercase_model_name = configuration["lowercase_modelname"]
- generate_tensorflow_pytorch_and_flax = configuration["generate_tensorflow_pytorch_and_flax"]
- os.remove(f"{directory}/configuration.json")
-
- output_pytorch = "PyTorch" in generate_tensorflow_pytorch_and_flax
- output_tensorflow = "TensorFlow" in generate_tensorflow_pytorch_and_flax
- output_flax = "Flax" in generate_tensorflow_pytorch_and_flax
-
- model_dir = f"{path_to_transformer_root}/src/transformers/models/{lowercase_model_name}"
- os.makedirs(model_dir, exist_ok=True)
- os.makedirs(f"{path_to_transformer_root}/tests/models/{lowercase_model_name}", exist_ok=True)
-
- # Tests require submodules as they have parent imports
- with open(f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/__init__.py", "w"):
- pass
-
- shutil.move(
- f"{directory}/__init__.py",
- f"{model_dir}/__init__.py",
- )
- shutil.move(
- f"{directory}/configuration_{lowercase_model_name}.py",
- f"{model_dir}/configuration_{lowercase_model_name}.py",
- )
-
- def remove_copy_lines(path):
- with open(path, "r") as f:
- lines = f.readlines()
- with open(path, "w") as f:
- for line in lines:
- if "# Copied from transformers." not in line:
- f.write(line)
-
- if output_pytorch:
- if not self._testing:
- remove_copy_lines(f"{directory}/modeling_{lowercase_model_name}.py")
-
- shutil.move(
- f"{directory}/modeling_{lowercase_model_name}.py",
- f"{model_dir}/modeling_{lowercase_model_name}.py",
- )
-
- shutil.move(
- f"{directory}/test_modeling_{lowercase_model_name}.py",
- f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_{lowercase_model_name}.py",
- )
- else:
- os.remove(f"{directory}/modeling_{lowercase_model_name}.py")
- os.remove(f"{directory}/test_modeling_{lowercase_model_name}.py")
-
- if output_tensorflow:
- if not self._testing:
- remove_copy_lines(f"{directory}/modeling_tf_{lowercase_model_name}.py")
-
- shutil.move(
- f"{directory}/modeling_tf_{lowercase_model_name}.py",
- f"{model_dir}/modeling_tf_{lowercase_model_name}.py",
- )
-
- shutil.move(
- f"{directory}/test_modeling_tf_{lowercase_model_name}.py",
- f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_tf_{lowercase_model_name}.py",
- )
- else:
- os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py")
- os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py")
-
- if output_flax:
- if not self._testing:
- remove_copy_lines(f"{directory}/modeling_flax_{lowercase_model_name}.py")
-
- shutil.move(
- f"{directory}/modeling_flax_{lowercase_model_name}.py",
- f"{model_dir}/modeling_flax_{lowercase_model_name}.py",
- )
-
- shutil.move(
- f"{directory}/test_modeling_flax_{lowercase_model_name}.py",
- f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_flax_{lowercase_model_name}.py",
- )
- else:
- os.remove(f"{directory}/modeling_flax_{lowercase_model_name}.py")
- os.remove(f"{directory}/test_modeling_flax_{lowercase_model_name}.py")
-
- shutil.move(
- f"{directory}/{lowercase_model_name}.md",
- f"{path_to_transformer_root}/docs/source/en/model_doc/{lowercase_model_name}.md",
- )
-
- shutil.move(
- f"{directory}/tokenization_{lowercase_model_name}.py",
- f"{model_dir}/tokenization_{lowercase_model_name}.py",
- )
-
- shutil.move(
- f"{directory}/tokenization_fast_{lowercase_model_name}.py",
- f"{model_dir}/tokenization_{lowercase_model_name}_fast.py",
- )
-
- from os import fdopen, remove
- from shutil import copymode, move
- from tempfile import mkstemp
-
- def replace(original_file: str, line_to_copy_below: str, lines_to_copy: List[str]):
- # Create temp file
- fh, abs_path = mkstemp()
- line_found = False
- with fdopen(fh, "w") as new_file:
- with open(original_file) as old_file:
- for line in old_file:
- new_file.write(line)
- if line_to_copy_below in line:
- line_found = True
- for line_to_copy in lines_to_copy:
- new_file.write(line_to_copy)
-
- if not line_found:
- raise ValueError(f"Line {line_to_copy_below} was not found in file.")
-
- # Copy the file permissions from the old file to the new file
- copymode(original_file, abs_path)
- # Remove original file
- remove(original_file)
- # Move new file
- move(abs_path, original_file)
-
- def skip_units(line):
- return (
- ("generating PyTorch" in line and not output_pytorch)
- or ("generating TensorFlow" in line and not output_tensorflow)
- or ("generating Flax" in line and not output_flax)
- )
-
- def replace_in_files(path_to_datafile):
- with open(path_to_datafile) as datafile:
- lines_to_copy = []
- skip_file = False
- skip_snippet = False
- for line in datafile:
- if "# To replace in: " in line and "##" not in line:
- file_to_replace_in = line.split('"')[1]
- skip_file = skip_units(line)
- elif "# Below: " in line and "##" not in line:
- line_to_copy_below = line.split('"')[1]
- skip_snippet = skip_units(line)
- elif "# End." in line and "##" not in line:
- if not skip_file and not skip_snippet:
- replace(file_to_replace_in, line_to_copy_below, lines_to_copy)
-
- lines_to_copy = []
- elif "# Replace with" in line and "##" not in line:
- lines_to_copy = []
- elif "##" not in line:
- lines_to_copy.append(line)
-
- remove(path_to_datafile)
-
- replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py")
- os.rmdir(directory)
diff --git a/src/transformers/commands/transformers_cli.py b/src/transformers/commands/transformers_cli.py
index 07396be2e54492..6e8cfea0c3141a 100644
--- a/src/transformers/commands/transformers_cli.py
+++ b/src/transformers/commands/transformers_cli.py
@@ -15,7 +15,6 @@
from argparse import ArgumentParser
-from .add_new_model import AddNewModelCommand
from .add_new_model_like import AddNewModelLikeCommand
from .convert import ConvertCommand
from .download import DownloadCommand
@@ -38,7 +37,6 @@ def main():
RunCommand.register_subcommand(commands_parser)
ServeCommand.register_subcommand(commands_parser)
UserCommands.register_subcommand(commands_parser)
- AddNewModelCommand.register_subcommand(commands_parser)
AddNewModelLikeCommand.register_subcommand(commands_parser)
LfsCommands.register_subcommand(commands_parser)
PTtoTFCommand.register_subcommand(commands_parser)
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index df7ce2f11672b8..a954186155bcb8 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -46,7 +46,7 @@ def import_protobuf(error_message=""):
def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
if add_prefix_space:
prepend_scheme = "always"
- if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
+ if not getattr(original_tokenizer, "legacy", True):
prepend_scheme = "first"
else:
prepend_scheme = "never"
@@ -105,7 +105,7 @@ def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]:
# there is a missing token in the vocab. We have to do this to support merges
# "<0x09>" is the bytefallback for `\t`
- vocab["\t"] = vocab.pop("<0x09>")
+ vocab["\t"] = vocab.get("<0x09>")
if vocab_scores is not None:
vocab_scores, reverse = dict(vocab_scores), True
@@ -1276,7 +1276,7 @@ def vocab(self, proto):
return vocab
def pre_tokenizer(self, replacement, add_prefix_space):
- return None
+ return pre_tokenizers.Split(" ", "merged_with_previous")
def unk_id(self, proto):
unk_id = 3
@@ -1329,7 +1329,7 @@ def tokenizer(self, proto):
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
user_defined_symbols = [
- AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
+ AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols
]
tokenizer.add_tokens(user_defined_symbols)
return tokenizer
@@ -1393,13 +1393,17 @@ def tokenizer(self, proto):
return tokenizer
def normalizer(self, proto):
- sequence = []
- if getattr(self.original_tokenizer, "add_prefix_space", True):
- sequence += [normalizers.Prepend(prepend="▁")]
- sequence += [normalizers.Replace(pattern=" ", content="▁")]
- return normalizers.Sequence(sequence)
+ if getattr(self.original_tokenizer, "legacy", True):
+ sequence = []
+ if getattr(self.original_tokenizer, "add_prefix_space"):
+ sequence += [normalizers.Prepend(prepend="▁")]
+ sequence += [normalizers.Replace(pattern=" ", content="▁")]
+ return normalizers.Sequence(sequence)
+ return None # non-legacy, no normalizer
def pre_tokenizer(self, replacement, add_prefix_space):
+ if not self.original_tokenizer.legacy: # non-legacy, we need a replace
+ prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+ return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
return None
def post_processor(self):
@@ -1445,6 +1449,99 @@ def converted(self) -> Tokenizer:
return tokenizer
+# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+ characters the bpe code barfs on.
+
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+ tables between utf-8 bytes and unicode strings.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("Ź"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+class TikTokenConverter:
+ """
+ A general tiktoken converter.
+ """
+
+ def __init__(
+ self,
+ vocab_file=None,
+ pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+ add_prefix_space=False,
+ *args,
+ ):
+ super().__init__(*args)
+ self.vocab_file = vocab_file
+ self.pattern = pattern
+ self.add_prefix_space = add_prefix_space
+
+ def extract_vocab_merges_from_model(self, tiktoken_url: str):
+ try:
+ from tiktoken.load import load_tiktoken_bpe
+ except Exception:
+ raise ValueError(
+ "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`."
+ )
+
+ bpe_ranks = load_tiktoken_bpe(tiktoken_url)
+ byte_encoder = bytes_to_unicode()
+
+ def token_bytes_to_string(b):
+ return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
+
+ merges = []
+ vocab = {}
+ for token, rank in bpe_ranks.items():
+ vocab[token_bytes_to_string(token)] = rank
+ if len(token) == 1:
+ continue
+ local = []
+ for index in range(1, len(token)):
+ piece_l, piece_r = token[:index], token[index:]
+ if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
+ local.append((piece_l, piece_r, rank))
+ local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False)
+ merges.extend(local)
+ merges = sorted(merges, key=lambda val: val[2], reverse=False)
+ merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
+ return vocab, merges
+
+ def tokenizer(self):
+ vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file)
+ tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
+ if hasattr(tokenizer.model, "ignore_merges"):
+ tokenizer.model.ignore_merges = True
+ return tokenizer
+
+ def converted(self) -> Tokenizer:
+ tokenizer = self.tokenizer()
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+ [
+ pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
+ pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
+ ]
+ )
+ tokenizer.decoder = decoders.ByteLevel()
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+ return tokenizer
+
+
SLOW_TO_FAST_CONVERTERS = {
"AlbertTokenizer": AlbertConverter,
"BartTokenizer": RobertaConverter,
diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py
index 5a42f474be2692..44c040ca6a4855 100644
--- a/src/transformers/generation/stopping_criteria.py
+++ b/src/transformers/generation/stopping_criteria.py
@@ -481,7 +481,18 @@ def __init__(self, eos_token_id: Union[int, List[int]]):
@add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
- is_done = torch.isin(input_ids[:, -1], self.eos_token_id.to(input_ids.device))
+ if input_ids.device.type == "mps":
+ # https://github.com/pytorch/pytorch/issues/77764#issuecomment-2067838075
+ is_done = (
+ input_ids[:, -1]
+ .tile(self.eos_token_id.shape[0], 1)
+ .eq(self.eos_token_id.unsqueeze(1).to(input_ids.device))
+ .sum(dim=0)
+ .bool()
+ .squeeze()
+ )
+ else:
+ is_done = torch.isin(input_ids[:, -1], self.eos_token_id.to(input_ids.device))
return is_done
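For clarity, a minimal standalone sketch of the MPS-safe replacement for `torch.isin` used above (toy values only):

```py
import torch

last_tokens = torch.tensor([5, 2, 9])  # input_ids[:, -1] for a batch of 3
eos_token_id = torch.tensor([2, 9])    # two possible EOS ids

# Broadcast-compare every last token against every EOS id, then reduce
# over the EOS dimension: elementwise equivalent of torch.isin.
is_done = (
    last_tokens.tile(eos_token_id.shape[0], 1)  # (num_eos, batch)
    .eq(eos_token_id.unsqueeze(1))              # broadcast comparison
    .sum(dim=0)                                 # matches per sequence
    .bool()
)
print(is_done)  # tensor([False,  True,  True])
```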
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index bf718932a43602..9e6a58d3e5a560 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -641,6 +641,7 @@ def _update_model_kwargs_for_generation(
model_kwargs: Dict[str, Any],
is_encoder_decoder: bool = False,
standardize_cache_format: bool = False,
+ num_new_tokens: int = 1,
) -> Dict[str, Any]:
# update past_key_values
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
@@ -671,7 +672,7 @@ def _update_model_kwargs_for_generation(
)
if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None:
- model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
return model_kwargs
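A toy illustration of why the pointer must advance by `num_new_tokens` rather than always by one: assisted decoding can accept several draft tokens per step (values below are made up):

```py
import torch

cache_position = torch.arange(0, 10)  # positions filled so far
num_new_tokens = 3                    # e.g. 2 accepted draft tokens + 1
cache_position = cache_position[-1:] + num_new_tokens
print(cache_position)  # tensor([12])
```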
@@ -1294,6 +1295,21 @@ def _prepare_generation_config(
return generation_config, model_kwargs
+ def _get_initial_cache_position(self, input_ids, model_kwargs):
+ """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
+ past_length = 0
+ if "past_key_values" in model_kwargs:
+ if isinstance(model_kwargs["past_key_values"], Cache):
+ past_length = model_kwargs["past_key_values"].get_seq_length()
+ else:
+ past_length = model_kwargs["past_key_values"][0][0].shape[2]
+ if "inputs_embeds" in model_kwargs:
+ cur_len = model_kwargs["inputs_embeds"].shape[1]
+ else:
+ cur_len = input_ids.shape[-1]
+ model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device)
+ return model_kwargs
+
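A minimal sketch of the pre-fill computation above, assuming 6 tokens are already cached and the prompt now spans 8 positions (numbers are illustrative):

```py
import torch

past_length = 6  # tokens already stored in `past_key_values`
cur_len = 8      # total prompt length at this step

# `cache_position` only covers the positions that still need to be written.
cache_position = torch.arange(past_length, cur_len)
print(cache_position)  # tensor([6, 7])
```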
@torch.no_grad()
def generate(
self,
@@ -1560,6 +1576,8 @@ def generate(
raise ValueError("assisted generate is only supported for batch_size = 1")
if not model_kwargs["use_cache"]:
raise ValueError("assisted generate requires `use_cache=True`")
+ if generation_config.cache_implementation == "static":
+ raise ValueError("assisted generate is not supported with `static_cache`")
# 11. Get the candidate generator, given the parameterization
candidate_generator = self._get_candidate_generator(
@@ -2024,11 +2042,9 @@ def _contrastive_search(
)
# keep track of which sequences are already finished
- batch_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
+ batch_size = input_ids.shape[0]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
this_peer_finished = False
@@ -2495,12 +2511,10 @@ def _greedy_search(
)
# keep track of which sequences are already finished
- batch_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
+ batch_size = input_ids.shape[0]
this_peer_finished = False
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# prepare model inputs
@@ -2792,12 +2806,10 @@ def _sample(
)
# keep track of which sequences are already finished
- batch_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
+ batch_size = input_ids.shape[0]
this_peer_finished = False
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# prepare model inputs
@@ -3108,9 +3120,7 @@ def _beam_search(
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
if num_beams * batch_size != batch_beam_size:
raise ValueError(
@@ -3514,9 +3524,7 @@ def _beam_sample(
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
@@ -3874,9 +3882,7 @@ def _group_beam_search(
device = input_ids.device
batch_beam_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
if return_dict_in_generate and output_scores:
beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)]
@@ -4292,9 +4298,7 @@ def _constrained_beam_search(
num_beams = constrained_beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
if num_beams * batch_size != batch_beam_size:
raise ValueError(
@@ -4655,11 +4659,9 @@ def _assisted_decoding(
)
# keep track of which sequences are already finished
- batch_size, cur_len = input_ids.shape
- if "inputs_embeds" in model_kwargs:
- cur_len = model_kwargs["inputs_embeds"].shape[1]
+ batch_size = input_ids.shape[0]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
- model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
this_peer_finished = False
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -4679,20 +4681,21 @@ def _assisted_decoding(
# we use this forward pass to also pick the subsequent logits in the original model.
# 2.1. Prepare the model inputs
- model_kwargs = _prepare_attention_mask(
- model_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder
+ candidate_kwargs = copy.copy(model_kwargs)
+ candidate_kwargs = _prepare_attention_mask(
+ candidate_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder
)
- model_kwargs = _prepare_token_type_ids(model_kwargs, candidate_input_ids.shape[1])
- if "cache_position" in model_kwargs:
- model_kwargs["cache_position"] = torch.cat(
+ candidate_kwargs = _prepare_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1])
+ if "cache_position" in candidate_kwargs:
+ candidate_kwargs["cache_position"] = torch.cat(
(
- model_kwargs["cache_position"],
+ candidate_kwargs["cache_position"],
torch.arange(cur_len, cur_len + candidate_length, device=input_ids.device, dtype=torch.long),
),
dim=0,
)
- model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **model_kwargs)
+ model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs)
if "num_logits_to_keep" in model_inputs:
model_inputs["num_logits_to_keep"] = candidate_length + 1
@@ -4811,6 +4814,7 @@ def _assisted_decoding(
outputs,
model_kwargs,
is_encoder_decoder=self.config.is_encoder_decoder,
+ num_new_tokens=n_matches + 1,
)
unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index db736439152cd3..5c2ef2ed318de6 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1160,12 +1160,13 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool
# For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are
# used for the 4bit quantization (uint8 tensors are stored)
if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit):
- quant_storage = self.hf_quantizer.quantization_config.bnb_4bit_quant_storage
- # For compatibility with older PT version - see: https://github.com/huggingface/peft/pull/1635
- nb_params = (
- quant_storage.itemsize if hasattr(quant_storage, "itemsize") else quant_storage.element_size()
- )
- total_numel.append(param.numel() * 2 * nb_params)
+ if hasattr(param, "element_size"):
+ num_bytes = param.element_size()
+ elif hasattr(param, "quant_storage"):
+ num_bytes = param.quant_storage.itemsize
+ else:
+ num_bytes = 1
+ total_numel.append(param.numel() * 2 * num_bytes)
else:
total_numel.append(param.numel())
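Toy arithmetic mirroring the 4-bit accounting above: a `Params4bit` tensor packs two 4-bit values per storage byte, so the true parameter count is `numel() * 2 * bytes_per_element` (values below are illustrative):

```py
numel = 1024   # packed storage elements reported by param.numel()
num_bytes = 1  # uint8 storage, i.e. element_size() == 1
print(numel * 2 * num_bytes)  # 2048 parameters actually represented
```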
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 292a264644be85..f07a4fc5887e09 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -179,6 +179,7 @@
perceiver,
persimmon,
phi,
+ phi3,
phobert,
pix2struct,
plbart,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 29a52ba755f023..c8280a1270ac66 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -191,6 +191,7 @@
("perceiver", "PerceiverConfig"),
("persimmon", "PersimmonConfig"),
("phi", "PhiConfig"),
+ ("phi3", "Phi3Config"),
("pix2struct", "Pix2StructConfig"),
("plbart", "PLBartConfig"),
("poolformer", "PoolFormerConfig"),
@@ -412,6 +413,7 @@
("lilt", "LiLT"),
("llama", "LLaMA"),
("llama2", "Llama2"),
+ ("llama3", "Llama3"),
("llava", "LLaVa"),
("llava_next", "LLaVA-NeXT"),
("longformer", "Longformer"),
@@ -470,6 +472,7 @@
("perceiver", "Perceiver"),
("persimmon", "Persimmon"),
("phi", "Phi"),
+ ("phi3", "Phi3"),
("phobert", "PhoBERT"),
("pix2struct", "Pix2Struct"),
("plbart", "PLBart"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index dcc4829f3f6f1e..50b2335800567a 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -180,6 +180,7 @@
("perceiver", "PerceiverModel"),
("persimmon", "PersimmonModel"),
("phi", "PhiModel"),
+ ("phi3", "Phi3Model"),
("plbart", "PLBartModel"),
("poolformer", "PoolFormerModel"),
("prophetnet", "ProphetNetModel"),
@@ -474,6 +475,7 @@
("pegasus", "PegasusForCausalLM"),
("persimmon", "PersimmonForCausalLM"),
("phi", "PhiForCausalLM"),
+ ("phi3", "Phi3ForCausalLM"),
("plbart", "PLBartForCausalLM"),
("prophetnet", "ProphetNetForCausalLM"),
("qdqbert", "QDQBertLMHeadModel"),
@@ -884,6 +886,7 @@
("perceiver", "PerceiverForSequenceClassification"),
("persimmon", "PersimmonForSequenceClassification"),
("phi", "PhiForSequenceClassification"),
+ ("phi3", "Phi3ForSequenceClassification"),
("plbart", "PLBartForSequenceClassification"),
("qdqbert", "QDQBertForSequenceClassification"),
("qwen2", "Qwen2ForSequenceClassification"),
@@ -1049,6 +1052,7 @@
("nezha", "NezhaForTokenClassification"),
("nystromformer", "NystromformerForTokenClassification"),
("phi", "PhiForTokenClassification"),
+ ("phi3", "Phi3ForTokenClassification"),
("qdqbert", "QDQBertForTokenClassification"),
("rembert", "RemBertForTokenClassification"),
("roberta", "RobertaForTokenClassification"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 822a680bc4bf33..363e0a4f9f8361 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -354,6 +354,7 @@
),
),
("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
+ ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("phobert", ("PhobertTokenizer", None)),
("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 5b9e0cf732399e..03e2fceb0e5b83 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -450,6 +450,11 @@ def _init_weights(self, module):
module.text_projection.weight,
std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
)
+ elif isinstance(module, CLIPForImageClassification):
+ nn.init.normal_(
+ module.classifier.weight,
+ std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
index 4e3b8498480c9e..d61877cb1f1e7e 100644
--- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -1209,6 +1209,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_
)
return model_inputs
+ def _get_initial_cache_position(self, input_ids, model_kwargs):
+ """
+ Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length.
+ Since GPT-BigCode uses a different cache format, the method is overridden here; other models use the default implementation from `generation/utils.py`.
+ """
+ past_length = 0
+ if "past_key_values" in model_kwargs:
+ if self.config.multi_query:
+ past_length = model_kwargs["past_key_values"][0].shape[1]
+ else:
+ past_length = model_kwargs["past_key_values"][0].shape[2]
+ if "inputs_embeds" in model_kwargs:
+ cur_len = model_kwargs["inputs_embeds"].shape[1]
+ else:
+ cur_len = input_ids.shape[-1]
+ model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device)
+ return model_kwargs
+
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py
index 9780d95d4ee376..80d5dad3cbd849 100755
--- a/src/transformers/models/jamba/modeling_jamba.py
+++ b/src/transformers/models/jamba/modeling_jamba.py
@@ -231,6 +231,7 @@ def __init__(self, config, batch_size, dtype=torch.float16, device=None):
conv_kernel_size = config.mamba_d_conv
self.conv_states = []
self.ssm_states = []
+ self.transformer_layers = []
for i in range(config.num_hidden_layers):
if self.layers_block_type[i] == "mamba":
self.conv_states += [
@@ -242,6 +243,7 @@ def __init__(self, config, batch_size, dtype=torch.float16, device=None):
else:
self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
+ self.transformer_layers.append(i)
self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
@@ -276,6 +278,14 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
device = self.ssm_states[layer_idx].device
self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+ # fall back to a transformer layer, whose key cache is never an empty tensor
+ layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+ if len(self.key_cache) <= layer_idx:
+ return 0
+ return self.key_cache[layer_idx].shape[-2]
+
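A toy sketch of the layer fallback in `get_seq_length` above: mamba layers keep empty key caches, so the length must be read from a transformer layer (shapes are made up):

```py
import torch

key_cache = [torch.empty(2, 0), torch.zeros(2, 4, 7, 8)]  # layer 0: mamba, layer 1: attention
transformer_layers = [1]

requested = 0  # caller asked for a mamba layer
layer_idx = transformer_layers[0] if requested not in transformer_layers else requested
print(key_cache[layer_idx].shape[-2])  # 7 cached positions
```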
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
@@ -909,6 +919,8 @@ def slow_forward(self, input_states, cache_params: HybridMambaAttentionDynamicCa
else:
ssm_state = cache_params.ssm_states[self.layer_idx]
+ ssm_state = ssm_state.to(hidden_states.device)
+
if cache_params.has_previous_state and seq_len == 1 and \
cache_params.conv_states[self.layer_idx].shape[0] == batch_size:
conv_state = cache_params.conv_states[self.layer_idx] # [batch, intermediate_size, conv_kernel_size]
@@ -952,7 +964,6 @@ def slow_forward(self, input_states, cache_params: HybridMambaAttentionDynamicCa
discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size]
discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float() # [batch, intermediate_size, seq_len, ssm_state_size]
deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
-
# 3.c perform the recurrence y ← SSM(A, B, C)(x)
scan_outputs = []
for i in range(seq_len):
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 41939b044a8438..e3c58fa47e51ad 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -503,6 +503,9 @@ def _init_weights(self, module):
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
+ elif isinstance(module, LayoutLMv2Model):
+ if hasattr(module, "visual_segment_embedding"):
+ module.visual_segment_embedding.data.normal_(mean=0.0, std=self.config.initializer_range)
def my_convert_sync_batchnorm(module, process_group=None):
@@ -822,7 +825,7 @@ def forward(
>>> import torch
>>> from datasets import load_dataset
- >>> set_seed(88)
+ >>> set_seed(0)
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
@@ -993,7 +996,7 @@ def forward(
>>> import torch
>>> from datasets import load_dataset
- >>> set_seed(88)
+ >>> set_seed(0)
>>> dataset = load_dataset("rvl_cdip", split="train", streaming=True)
>>> data = next(iter(dataset))
@@ -1012,8 +1015,8 @@ def forward(
>>> loss, logits = outputs.loss, outputs.logits
>>> predicted_idx = logits.argmax(dim=-1).item()
>>> predicted_answer = dataset.info.features["label"].names[4]
- >>> predicted_idx, predicted_answer
- (4, 'advertisement')
+ >>> predicted_idx, predicted_answer # results are not good without further fine-tuning
+ (7, 'advertisement')
```
"""
@@ -1172,7 +1175,7 @@ def forward(
>>> from PIL import Image
>>> from datasets import load_dataset
- >>> set_seed(88)
+ >>> set_seed(0)
>>> datasets = load_dataset("nielsr/funsd", split="test")
>>> labels = datasets.features["ner_tags"].feature.names
@@ -1203,8 +1206,8 @@ def forward(
>>> predicted_token_class_ids = logits.argmax(-1)
>>> predicted_tokens_classes = [id2label[t.item()] for t in predicted_token_class_ids[0]]
- >>> predicted_tokens_classes[:5]
- ['B-ANSWER', 'B-HEADER', 'B-HEADER', 'B-HEADER', 'B-HEADER']
+ >>> predicted_tokens_classes[:5] # results are not good without further fine-tuning
+ ['I-HEADER', 'I-HEADER', 'I-QUESTION', 'I-HEADER', 'I-QUESTION']
```
"""
@@ -1314,7 +1317,7 @@ def forward(
>>> from PIL import Image
>>> from datasets import load_dataset
- >>> set_seed(88)
+ >>> set_seed(0)
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
@@ -1328,12 +1331,12 @@ def forward(
>>> predicted_start_idx = outputs.start_logits.argmax(-1).item()
>>> predicted_end_idx = outputs.end_logits.argmax(-1).item()
>>> predicted_start_idx, predicted_end_idx
- (154, 287)
+ (30, 191)
>>> predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
>>> predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
- >>> predicted_answer # results are not very good without further fine-tuning
- 'council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public ...
+ >>> predicted_answer # results are not good without further fine-tuning
+ '44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from'
```
```python
@@ -1343,7 +1346,7 @@ def forward(
>>> predicted_answer_span_start = outputs.start_logits.argmax(-1).item()
>>> predicted_answer_span_end = outputs.end_logits.argmax(-1).item()
>>> predicted_answer_span_start, predicted_answer_span_end
- (154, 287)
+ (30, 191)
```
"""
diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
index f9bca1204a22ec..a98d44b7484ada 100644
--- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
+++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
@@ -20,7 +20,8 @@
import torch
-from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
+from transformers.convert_slow_tokenizer import TikTokenConverter
try:
@@ -51,10 +52,31 @@
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
+
+If you want your tokenizer to add a BOS token automatically, you should update `tokenizer._tokenizer.post_processor`:
+
+```py
+from tokenizers import processors
+bos = "<|begin_of_text|>"
+tokenizer._tokenizer.post_processor = processors.Sequence(
+ [
+ processors.ByteLevel(trim_offsets=False),
+ processors.TemplateProcessing(
+ single=f"{bos}:0 $A:0",
+ pair=f"{bos}:0 $A:0 {bos}:1 $B:1",
+ special_tokens=[
+ (bos, tokenizer.encode(bos)),
+ ],
+ ),
+ ]
+)
+```
"""
NUM_SHARDS = {
"7B": 1,
+ "8B": 1,
+ "8Bf": 1,
"7Bf": 1,
"13B": 2,
"13Bf": 2,
@@ -81,7 +103,12 @@ def write_json(text, path):
def write_model(
- model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True, llama_version=1
+ model_path,
+ input_base_path,
+ model_size,
+ safe_serialization=True,
+ llama_version=1,
+ vocab_size=None,
):
# for backward compatibility, before you needed the repo to be called `my_repo/model_size`
if not os.path.isfile(os.path.join(input_base_path, "params.json")):
@@ -101,7 +128,7 @@ def write_model(
dims_per_head = dim // n_heads
base = params.get("rope_theta", 10000.0)
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
- if base > 10000.0:
+ if base > 10000.0 and llama_version != 3:
max_position_embeddings = 16384
else:
# Depending on the Llama version, the default max_position_embeddings has different values.
@@ -109,18 +136,10 @@ def write_model(
max_position_embeddings = 2048
elif llama_version == 2:
max_position_embeddings = 4096
- else:
- raise NotImplementedError(
- f"Version {llama_version} of llama is not supported yet. "
- "Current supported versions of llama are [1, 2]."
- )
-
- tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
- if tokenizer_path is not None:
- tokenizer = tokenizer_class(tokenizer_path)
- tokenizer.save_pretrained(model_path)
- vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000
+ elif llama_version == 3:
+ max_position_embeddings = 8192
+ vocab_size = vocab_size if vocab_size is not None else 32000
if params.get("n_kv_heads", None) is not None:
num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
@@ -131,7 +150,7 @@ def write_model(
key_value_dim = dim
# permute for sliced rotary
- def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
+ def permute(w, n_heads, dim1=dim, dim2=dim):
return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
@@ -154,10 +173,12 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
# Unsharded
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
- loaded[f"layers.{layer_i}.attention.wq.weight"]
+ loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
- loaded[f"layers.{layer_i}.attention.wk.weight"]
+ loaded[f"layers.{layer_i}.attention.wk.weight"],
+ n_heads=num_key_value_heads,
+ dim1=dim // num_local_key_value_heads,
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
@@ -188,7 +209,8 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
for i in range(num_shards)
],
dim=0,
- ).reshape(dim, dim)
+ ).reshape(dim, dim),
+ n_heads=n_heads,
)
state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
torch.cat(
@@ -242,10 +264,11 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
"lm_head.weight": loaded["output.weight"],
}
else:
+ concat_dim = 0 if llama_version == 3 else 1
state_dict = {
"model.norm.weight": loaded[0]["norm.weight"],
"model.embed_tokens.weight": torch.cat(
- [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
+ [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=concat_dim
),
"lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
}
@@ -270,6 +293,8 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
vocab_size=vocab_size,
rope_theta=base,
max_position_embeddings=max_position_embeddings,
+ bos_token_id=128000 if llama_version == 3 else 1,
+ eos_token_id=128001 if llama_version == 3 else 2,
)
config.save_pretrained(tmp_model_path)
@@ -288,12 +313,54 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
shutil.rmtree(tmp_model_path)
-def write_tokenizer(tokenizer_path, input_tokenizer_path):
- # Initialize the tokenizer based on the `spm` model
+class Llama3Converter(TikTokenConverter):
+ def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs):
+ super().__init__(vocab_file, **kwargs)
+ tokenizer = self.converted()
+ chat_template = (
+ "{% set loop_messages = messages %}"
+ "{% for message in loop_messages %}"
+ "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
+ "{% if loop.index0 == 0 %}"
+ "{% set content = bos_token + content %}"
+ "{% endif %}"
+ "{{ content }}"
+ "{% endfor %}"
+ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+ )
+ special_tokens = [
+ "<|begin_of_text|>",
+ "<|end_of_text|>",
+ "<|reserved_special_token_0|>",
+ "<|reserved_special_token_1|>",
+ "<|reserved_special_token_2|>",
+ "<|reserved_special_token_3|>",
+ "<|start_header_id|>",
+ "<|end_header_id|>",
+ "<|reserved_special_token_4|>",
+ "<|eot_id|>", # end of turn
+ ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
+ tokenizer.add_special_tokens(special_tokens)
+
+ self.tokenizer = PreTrainedTokenizerFast(
+ tokenizer_object=tokenizer,
+ bos_token="<|begin_of_text|>",
+ eos_token="<|end_of_text|>",
+ chat_template=chat_template,
+ model_input_names=["input_ids", "attention_mask"],
+ )
+
+
+def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2):
tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
+ if llama_version == 3:
+ tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
+ else:
+ tokenizer = tokenizer_class(input_tokenizer_path)
print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
- tokenizer = tokenizer_class(input_tokenizer_path)
tokenizer.save_pretrained(tokenizer_path)
+ return tokenizer
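A hedged sketch of the chat template registered by `Llama3Converter`; `tokenizer` is assumed to be the `PreTrainedTokenizerFast` built above:

```py
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# The first turn is prefixed with the BOS token, per the template above.
print(prompt.startswith("<|begin_of_text|><|start_header_id|>user<|end_header_id|>"))  # True
```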
def main():
@@ -304,35 +371,36 @@ def main():
)
parser.add_argument(
"--model_size",
- choices=["7B", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"],
+ choices=["7B", "8B", "8Bf", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"],
help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
- parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
+ parser.add_argument(
+ "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ )
# Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used.
parser.add_argument(
"--llama_version",
- choices=[1, 2],
+ choices=[1, 2, 3],
default=1,
type=int,
help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size",
)
args = parser.parse_args()
spm_path = os.path.join(args.input_dir, "tokenizer.model")
+ vocab_size = len(write_tokenizer(args.output_dir, spm_path, llama_version=args.llama_version))
if args.model_size != "tokenizer_only":
write_model(
model_path=args.output_dir,
input_base_path=args.input_dir,
model_size=args.model_size,
safe_serialization=args.safe_serialization,
- tokenizer_path=spm_path,
llama_version=args.llama_version,
+ vocab_size=vocab_size,
)
- else:
- write_tokenizer(args.output_dir, spm_path)
if __name__ == "__main__":
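A hypothetical end-to-end Llama 3 conversion mirroring what `main()` does (all paths are placeholders):

```py
# Convert the tokenizer first so its length can seed the model config.
tokenizer = write_tokenizer("out/llama3-8b-hf", "ckpt/tokenizer.model", llama_version=3)
write_model(
    model_path="out/llama3-8b-hf",
    input_base_path="ckpt",
    model_size="8B",
    llama_version=3,
    vocab_size=len(tokenizer),
)
```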
diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index d95694a1f72c17..def5e8ecbaacf1 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -99,30 +99,30 @@ class LlamaTokenizer(PreTrainedTokenizer):
Whether or not to add spaces between special tokens.
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
- and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
- example:
+ and #25224 which includes fixes to properly handle tokens that appear after special tokens.
+ Make sure to also set `from_slow` to `True`.
+ A simple example:
- `legacy=True`:
```python
- >>> from transformers import T5Tokenizer
+ >>> from transformers import LlamaTokenizerFast
- >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
- >>> tokenizer.encode("Hello .")
- [8774, 32099, 3, 5, 1]
+ >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+ >>> tokenizer.encode("Hello .") # 869 is 'â.'
+ [1, 15043, 29871, 1, 869]
```
- `legacy=False`:
```python
- >>> from transformers import T5Tokenizer
+ >>> from transformers import LlamaTokenizerFast
- >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
- >>> tokenizer.encode("Hello .") # the extra space `[3]` is no longer here
- [8774, 32099, 5, 1]
+ >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
+ >>> tokenizer.encode("Hello .") # 29889 is '.'
+ [1, 15043, 29871, 1, 29889]
```
Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
add_prefix_space (`bool`, *optional*, defaults to `True`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
- other word.
-
+ other word. Again, this should be set with `from_slow=True` to make sure it's taken into account.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index f9ce292b7faab3..ccc01cd61914e9 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -91,7 +91,30 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
- Whether or not the default system prompt for Llama should be used.
+ Whether or not the default system prompt for Llama should be used.
+ legacy (`bool`, *optional*):
+ Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
+ and #25224 which includes fixes to properly handle tokens that appear after special tokens.
+ Make sure to also set `from_slow` to `True`.
+ A simple example:
+
+ - `legacy=True`:
+ ```python
+ >>> from transformers import LlamaTokenizerFast
+
+ >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+ >>> tokenizer.encode("Hello .") # 869 is 'â.'
+ [1, 15043, 29871, 1, 869]
+ ```
+ - `legacy=False`:
+ ```python
+ >>> from transformers import LlamaTokenizerFast
+
+ >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
+ >>> tokenizer.encode("Hello .") # 29889 is '.'
+ [1, 15043, 29871, 1, 29889]
+ ```
+ Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
add_prefix_space (`bool`, *optional*):
Whether or not the tokenizer should automatically add a prefix space
"""
@@ -112,9 +135,21 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
+ legacy=None,
add_prefix_space=None,
**kwargs,
):
+ if legacy is None:
+ logger.warning_once(
+ f"You are using the default legacy behaviour of the {self.__class__}. This is"
+ " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
+ " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
+ " means, and thoroughly read the reason why this was added as explained in"
+ " https://github.com/huggingface/transformers/pull/24565"
+ )
+ legacy = True
+ self.legacy = legacy
+
if add_prefix_space is not None:
logger.warning_once(
"You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
diff --git a/src/transformers/models/phi3/__init__.py b/src/transformers/models/phi3/__init__.py
new file mode 100644
index 00000000000000..20cb69f4abc801
--- /dev/null
+++ b/src/transformers/models/phi3/__init__.py
@@ -0,0 +1,69 @@
+# Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_sentencepiece_available,
+ is_tokenizers_available,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_phi3": ["PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP", "Phi3Config"],
+}
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_phi3"] = [
+ "PHI3_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "Phi3PreTrainedModel",
+ "Phi3Model",
+ "Phi3ForCausalLM",
+ "Phi3ForSequenceClassification",
+ "Phi3ForTokenClassification",
+ ]
+
+
+if TYPE_CHECKING:
+ from .configuration_phi3 import PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP, Phi3Config
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_phi3 import (
+ PHI3_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Phi3ForCausalLM,
+ Phi3ForSequenceClassification,
+ Phi3ForTokenClassification,
+ Phi3Model,
+ Phi3PreTrainedModel,
+ )
+
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py
new file mode 100644
index 00000000000000..e835c50f63eed5
--- /dev/null
+++ b/src/transformers/models/phi3/configuration_phi3.py
@@ -0,0 +1,213 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Phi-3 model configuration"""
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+ "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
+ "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
+}
+
+
+class Phi3Config(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the
+ [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 32064):
+ Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`Phi3Model`].
+ hidden_size (`int`, *optional*, defaults to 3072):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 8192):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 32):
+ Number of hidden layers in the Transformer decoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ num_key_value_heads (`int`, *optional*):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
+ Dropout probability for the MLP outputs.
+ embd_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the embeddings.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio after computing the attention scores.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
+ The maximum sequence length that this model might ever be used with.
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
+ original RoPE embeddings when using long scaling.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon value used for the RMSNorm.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to tie the input and output word embeddings.
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ rope_scaling (`dict`, *optional*):
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
+ divided by the number of attention heads divided by 2.
+ bos_token_id (`int`, *optional*, defaults to 1):
+ The id of the "beginning-of-sequence" token.
+ eos_token_id (`int`, *optional*, defaults to 32000):
+ The id of the "end-of-sequence" token.
+ pad_token_id (`int`, *optional*, defaults to 32000):
+ The id of the padding token.
+ sliding_window (`int`, *optional*):
+ Sliding window attention window size. If `None`, no sliding window is applied.
+
+ Example:
+
+ ```python
+ >>> from transformers import Phi3Model, Phi3Config
+
+ >>> # Initializing a Phi-3 style configuration
+ >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+ >>> # Initializing a model from the configuration
+ >>> model = Phi3Model(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "phi3"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32064,
+ hidden_size=3072,
+ intermediate_size=8192,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ resid_pdrop=0.0,
+ embd_pdrop=0.0,
+ attention_dropout=0.0,
+ hidden_act="silu",
+ max_position_embeddings=4096,
+ original_max_position_embeddings=4096,
+ initializer_range=0.02,
+ rms_norm_eps=1e-5,
+ use_cache=True,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ bos_token_id=1,
+ eos_token_id=32000,
+ pad_token_id=32000,
+ sliding_window=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.resid_pdrop = resid_pdrop
+ self.embd_pdrop = embd_pdrop
+ self.attention_dropout = attention_dropout
+ self.hidden_act = hidden_act
+ self.max_position_embeddings = max_position_embeddings
+ self.original_max_position_embeddings = original_max_position_embeddings
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+ self.sliding_window = sliding_window
+
+ super().__init__(
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ pad_token_id=pad_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+ rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+ if not (
+ isinstance(rope_scaling_short_factor, list)
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+ ):
+ raise ValueError(
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+ )
+ if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+ raise ValueError(
+ f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+ )
+ if not (
+ isinstance(rope_scaling_long_factor, list)
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+ ):
+ raise ValueError(
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+ )
+ if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+ raise ValueError(
+ f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+ )
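A hedged example of a `rope_scaling` dict that passes the validation above, assuming the default geometry (`hidden_size=3072`, `num_attention_heads=32`, so each factor list needs `3072 // 32 // 2 == 48` entries):

```py
factors = [1.0] * 48  # one entry per rotary frequency
config = Phi3Config(
    max_position_embeddings=131072,
    rope_scaling={"type": "su", "short_factor": factors, "long_factor": factors},
)
print(config.rope_scaling["type"])  # 'su'
```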
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
new file mode 100644
index 00000000000000..f9364d130b7e6c
--- /dev/null
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -0,0 +1,1595 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" PyTorch Phi-3 model."""
+
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+ TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_flash_attn_2_available,
+ is_flash_attn_greater_or_equal_2_10,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_phi3 import Phi3Config
+
+
+if is_flash_attn_2_available():
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
+_CONFIG_FOR_DOC = "Phi3Config"
+
+PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "microsoft/Phi-3-mini-4k-instruct",
+ "microsoft/Phi-3-mini-128k-instruct",
+ # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
+]
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
+class Phi3RMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ Phi3RMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
+class Phi3RotaryEmbedding(nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ self.register_buffer("inv_freq", None, persistent=False)
+
+ @torch.no_grad()
+ def forward(self, x, position_ids, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if self.inv_freq is None:
+ self.inv_freq = 1.0 / (
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+ )
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
+ def __init__(self, dim, config, device=None):
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
+
+ self.short_factor = config.rope_scaling["short_factor"]
+ self.long_factor = config.rope_scaling["long_factor"]
+ self.original_max_position_embeddings = config.original_max_position_embeddings
+
+ @torch.no_grad()
+ def forward(self, x, position_ids, seq_len=None):
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.original_max_position_embeddings:
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+ else:
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
+
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
+
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
+ if scale <= 1.0:
+ scaling_factor = 1.0
+ else:
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+
+ cos = emb.cos() * scaling_factor
+ sin = emb.sin() * scaling_factor
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
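A numeric check of the `su` scaling factor above, assuming `max_position_embeddings=131072` and `original_max_position_embeddings=4096` (illustrative numbers matching the 128k-context geometry):

```py
import math

scale = 131072 / 4096  # 32x context extension
scaling_factor = math.sqrt(1 + math.log(scale) / math.log(4096))
print(round(scaling_factor, 4))  # 1.1902, i.e. cos/sin magnitudes grow ~19%
```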
+
+class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
+ def __init__(self, dim, config, device=None):
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
+
+ self.short_factor = config.rope_scaling["short_factor"]
+ self.long_factor = config.rope_scaling["long_factor"]
+ self.original_max_position_embeddings = config.original_max_position_embeddings
+
+ @torch.no_grad()
+ def forward(self, x, position_ids, seq_len=None):
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.original_max_position_embeddings:
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+ else:
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
+
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
+
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
+ if scale <= 1.0:
+ scaling_factor = 1.0
+ else:
+ scaling_factor = 0.1 * math.log(scale) + 1.0
+
+ cos = emb.cos() * scaling_factor
+ sin = emb.sin() * scaling_factor
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
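+# Shape sketch for the default unsqueeze_dim=1 (illustrative, following the layout used in this
+# file): q and k are (batch, num_heads, seq_len, head_dim) while cos and sin come out of the
+# rotary embedding as (batch, seq_len, head_dim); unsqueezing dim 1 yields
+# (batch, 1, seq_len, head_dim), which broadcasts across the head dimension of q and k.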
+
+class Phi3MLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ self.config = config
+ self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+
+ self.activation_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+ up_states = self.gate_up_proj(hidden_states)
+
+ gate, up_states = up_states.chunk(2, dim=-1)
+ up_states = up_states * self.activation_fn(gate)
+
+ return self.down_proj(up_states)
+
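+# The MLP above fuses the two input projections that e.g. Llama keeps separate; a rough
+# equivalence sketch with hypothetical unfused names (commented out, illustrative only):
+#   gate = gate_proj(hidden_states)  # first half of gate_up_proj's output
+#   up = up_proj(hidden_states)      # second half of gate_up_proj's output
+#   out = down_proj(up * activation_fn(gate))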
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
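+# Illustrative sketch (not part of the model code): with num_key_value_heads=2 and n_rep=2, the
+# kv heads [h0, h1] become [h0, h0, h1, h1] along dim=1, matching
+# torch.repeat_interleave(hidden_states, repeats=2, dim=1).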
+
+class Phi3Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ if layer_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+ "when creating this class."
+ )
+
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.original_max_position_embeddings = config.original_max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.rope_scaling = config.rope_scaling
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.rope_scaling is None:
+ self.rotary_emb = Phi3RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ if scaling_type == "su":
+ self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
+ elif scaling_type == "yarn":
+ self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
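+ # Illustrative sketch of the `rope_scaling` dict consumed by `_init_rope` above (hypothetical
+ # values; the real factor lists need one entry per rotary frequency, i.e. head_dim // 2):
+ #   config.rope_scaling = {
+ #       "type": "su",  # or "yarn"
+ #       "short_factor": [1.0, 1.0, ...],
+ #       "long_factor": [1.05, 1.17, ...],
+ #   }
+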
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.")
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv = self.qkv_proj(hidden_states)
+ query_pos = self.num_heads * self.head_dim
+ query_states = qkv[..., :query_pos]
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ if self.layer_idx is None:
+ raise ValueError(
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+ "with a layer index."
+ )
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class Phi3FlashAttention2(Phi3Attention):
+ """
+ Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stay
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any.
+ """
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Phi3FlashAttention2 attention does not support output_attentions
+
+ if not _flash_supports_window_size:
+ logger.warning_once(
+ "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
+ )
+ raise ValueError("The current flash attention version does not support sliding window attention.")
+
+ output_attentions = False
+
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv = self.qkv_proj(hidden_states)
+ query_pos = self.num_heads * self.head_dim
+ query_states = qkv[..., :query_pos]
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+ # Flash attention requires the input to have the shape
+ # batch_size x seq_length x num_heads x head_dim
+ # therefore we just need to keep the original shape
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ if self.layer_idx is None:
+ raise ValueError(
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+ "with a layer index."
+ )
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ use_sliding_windows = (
+ _flash_supports_window_size
+ and getattr(self.config, "sliding_window", None) is not None
+ and kv_seq_len > self.config.sliding_window
+ )
+
+ if past_key_value is not None:
+ # Activate slicing cache only if the config has a `sliding_window` attribute
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+ if (
+ getattr(self.config, "sliding_window", None) is not None
+ and kv_seq_len > self.config.sliding_window
+ and cache_has_contents
+ ):
+ slicing_tokens = 1 - self.config.sliding_window
+
+ past_key = past_key_value[self.layer_idx][0]
+ past_value = past_key_value[self.layer_idx][1]
+
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+ if past_key.shape[-2] != self.config.sliding_window - 1:
+ raise ValueError(
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+ f" {past_key.shape}"
+ )
+
+ if attention_mask is not None:
+ attention_mask = attention_mask[:, slicing_tokens:]
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_dropout = self.attention_dropout if self.training else 0.0
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
+ # cast them back to the correct dtype just to be sure everything works as expected.
+ # This might slow down training & inference so it is recommended to not cast the LayerNorms
+ # in fp32.
+
+ if query_states.dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.qkv_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ # Reshape to the expected shape for Flash Attention
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ attn_output = self._flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ dropout=attn_dropout,
+ use_sliding_windows=use_sliding_windows,
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
+ def _flash_attention_forward(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ query_length,
+ dropout=0.0,
+ softmax_scale=None,
+ use_sliding_windows=False,
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
+ first unpad the input, then compute the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`float`):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Defaults to `1 / sqrt(head_dim)`.
+ use_sliding_windows (`bool`, *optional*):
+ Whether to activate sliding window attention.
+ """
+ if not self._flash_attn_uses_top_left_mask:
+ causal = self.is_causal
+ else:
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+ causal = self.is_causal and query_length != 1
+
+ # Contains at least one padding token in the sequence
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ if not use_sliding_windows:
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+ else:
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ window_size=(self.config.sliding_window, self.config.sliding_window),
+ )
+
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ if not use_sliding_windows:
+ attn_output = flash_attn_func(
+ query_states,
+ key_states,
+ value_states,
+ dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+ else:
+ attn_output = flash_attn_func(
+ query_states,
+ key_states,
+ value_states,
+ dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ window_size=(self.config.sliding_window, self.config.sliding_window),
+ )
+
+ return attn_output
+
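+ # Worked sketch of the unpadding path used above (illustrative): for an attention_mask of
+ # [[1, 1, 0], [1, 1, 1]] the two sequences have lengths 2 and 3, so `_upad_input` packs the
+ # five real tokens into one flat batch and `flash_attn_varlen_func` receives
+ # cu_seqlens_k == tensor([0, 2, 5]) as the cumulative sequence boundaries.
+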
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
+
+ # On the first iteration we need to properly re-create the padding mask
+ # by slicing it in the proper place
+ if kv_seq_len != attention_mask.shape[-1]:
+ attention_mask_num_tokens = attention_mask.shape[-1]
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
+
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q,
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
+# TODO @Arthur no longer copied from Llama after static cache
+class Phi3SdpaAttention(Phi3Attention):
+ """
+ Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+ `Phi3Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+ the SDPA API.
+ """
+
+ # Adapted from Phi3Attention.forward
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv = self.qkv_proj(hidden_states)
+ query_pos = self.num_heads * self.head_dim
+ query_states = qkv[..., :query_pos]
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query_states.device.type == "cuda" and attention_mask is not None:
+ query_states = query_states.contiguous()
+ key_states = key_states.contiguous()
+ value_states = value_states.contiguous()
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=attention_mask,
+ dropout_p=self.attention_dropout if self.training else 0.0,
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+ attn_output = self.o_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+PHI3_ATTENTION_CLASSES = {
+ "eager": Phi3Attention,
+ "flash_attention_2": Phi3FlashAttention2,
+ "sdpa": Phi3SdpaAttention,
+}
+
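+# Illustrative note (not part of the public API contract): the decoder layer below selects its
+# attention class from this mapping via `config._attn_implementation`, which is typically set at
+# load time, e.g. `AutoModelForCausalLM.from_pretrained(checkpoint, attn_implementation="sdpa")`.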
+
+class Phi3DecoderLayer(nn.Module):
+ def __init__(self, config: Phi3Config, layer_idx: int):
+ super().__init__()
+
+ self.config = config
+ self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+
+ self.mlp = Phi3MLP(config)
+ self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
+ self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+ self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+ )
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = residual + self.resid_attn_dropout(attn_outputs)
+
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + self.resid_mlp_dropout(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
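+# A compact sketch of the pre-norm residual flow implemented by Phi3DecoderLayer.forward above
+# (illustrative, ignoring caching and attention outputs):
+#   h = x + resid_attn_dropout(self_attn(input_layernorm(x)))
+#   out = h + resid_mlp_dropout(mlp(post_attention_layernorm(h)))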
+
+PHI3_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`Phi3Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
+ PHI3_START_DOCSTRING,
+)
+class Phi3PreTrainedModel(PreTrainedModel):
+ config_class = Phi3Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["Phi3DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+ _supports_sdpa = False
+ _supports_cache_class = True
+
+ _version = "0.0.5"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+PHI3_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify it to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ Two formats are allowed:
+ - a [`~cache_utils.Cache`] instance;
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+ cache format.
+
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+ legacy cache format will be returned.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
+ PHI3_START_DOCSTRING,
+)
+class Phi3Model(Phi3PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
+
+ Args:
+ config: Phi3Config
+ """
+
+ def __init__(self, config: Phi3Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
+ self.layers = nn.ModuleList(
+ [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self._attn_implementation = config._attn_implementation
+ self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ past_key_values_length = 0
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ if use_cache:
+ use_legacy_cache = not isinstance(past_key_values, Cache)
+ if use_legacy_cache:
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+ if is_padding_right:
+ raise ValueError(
+ "You are attempting to perform batched generation with padding_side='right'"
+ " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+ )
+
+ if self._attn_implementation == "flash_attention_2":
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ # 4d mask is passed through the layers
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
+ sliding_window=self.config.sliding_window,
+ )
+
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = None
+
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = None
+ if use_cache:
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class Phi3ForCausalLM(Phi3PreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = Phi3Model(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
+ def get_decoder(self):
+ return self.model
+
+ # Ignore copy
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, Phi3ForCausalLM
+
+ >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+ >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+
+ >>> prompt = "This is an example script ."
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ 'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ if isinstance(past_key_values, Cache):
+ cache_length = past_key_values.get_seq_length()
+ past_length = past_key_values.seen_tokens
+ max_cache_length = past_key_values.get_max_length()
+ else:
+ cache_length = past_length = past_key_values[0][0].shape[2]
+ max_cache_length = None
+
+ # Keep only the unprocessed tokens:
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+ # input)
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+ # input_ids based on the past_length.
+ elif past_length < input_ids.shape[1]:
+ input_ids = input_ids[:, past_length:]
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+ if (
+ max_cache_length is not None
+ and attention_mask is not None
+ and cache_length + input_ids.shape[1] > max_cache_length
+ ):
+ attention_mask = attention_mask[:, -max_cache_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
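+# Illustrative sketch (not part of the model code): during beam search `_reorder_cache` applies,
+# e.g., beam_idx = torch.tensor([1, 0]) to every cached key/value tensor, swapping the two beams
+# along the batch dimension so the cache follows the reordered hypotheses.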
+
+@add_start_docstrings(
+ """
+ The [`Phi3Model`] with a sequence classification head on top (linear layer).
+
+ [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+ (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it requires knowing the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ PHI3_START_DOCSTRING,
+)
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
+class Phi3ForSequenceClassification(Phi3PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = Phi3Model(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ model_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = model_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
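+ # Worked sketch (illustrative): for input_ids == [[5, 6, PAD, PAD]] the argmax of the pad mask
+ # is 2, so sequence_lengths == 1, the last non-pad index; when no pad token is present, argmax
+ # is 0, giving -1, and the modulo above maps it to seq_len - 1 (the last position).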
+ sequence_lengths = sequence_lengths.to(logits.device)
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + model_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=model_outputs.past_key_values,
+ hidden_states=model_outputs.hidden_states,
+ attentions=model_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ PHI3_START_DOCSTRING,
+)
+# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
+class Phi3ForTokenClassification(Phi3PreTrainedModel):
+ def __init__(self, config: Phi3Config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.model = Phi3Model(config)
+ if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
+ classifier_dropout = config.classifier_dropout
+ elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
+ classifier_dropout = config.hidden_dropout
+ else:
+ classifier_dropout = 0.1
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **deprecated_arguments,
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ model_outputs = self.model(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = model_outputs[0]
+ hidden_states = self.dropout(hidden_states)
+ logits = self.classifier(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(logits.device)
+ batch_size, seq_length = labels.shape
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
+ )
+
+ if not return_dict:
+ output = (logits,) + model_outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=model_outputs.hidden_states,
+ attentions=model_outputs.attentions,
+ )
diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py
index 79fd309eaf808f..64cd4296f7a554 100644
--- a/src/transformers/models/seggpt/modeling_seggpt.py
+++ b/src/transformers/models/seggpt/modeling_seggpt.py
@@ -753,11 +753,15 @@ def forward(
bool_masked_pos: Optional[torch.BoolTensor] = None,
feature_ensemble: Optional[bool] = None,
embedding_type: Optional[str] = None,
+ labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SegGptEncoderOutput]:
r"""
+ labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
+ Ground truth mask for input images.
+
Returns:
Examples:
@@ -799,10 +803,21 @@ def forward(
# Prepare inputs
pixel_values = torch.cat((prompt_pixel_values, pixel_values), dim=2)
- prompt_pixel_values = torch.cat((prompt_masks, prompt_masks), dim=2)
+ prompt_pixel_values = (
+ torch.cat((prompt_masks, prompt_masks), dim=2)
+ if labels is None
+ else torch.cat((prompt_masks, labels), dim=2)
+ )
+
+ if bool_masked_pos is None and labels is not None:
+ logger.warning_once(
+ "Labels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos."
+ )
# We concat on height axis so SegGPT can handle as a single image, hence we need to mask the portion
- # of the prompt pixels that will be destinated to the prediction as they don't add any information.
+ # of the mask prompt pixels that are destined for the prediction, as they don't add any information.
+ # This is only the case for inference. In training, the concatenation of the prompt mask and the label is masked
+ # and reconstructed together (In-Context Painting).
if bool_masked_pos is None:
num_patches = self.embeddings.patch_embeddings.num_patches
bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool).to(pixel_values.device)
@@ -840,7 +855,9 @@ def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> tor
batch_size = tensor.shape[0]
patch_size = int((tensor.shape[-1] / 3) ** 0.5)
if patch_height * patch_width != tensor.shape[1]:
- raise ValueError(f"Number of patches {tensor.shape[1]} does not match patch height and width.")
+ raise ValueError(
+ f"Number of patches {tensor.shape[1]} does not match patch height ({patch_height}) and width ({patch_width})."
+ )
tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
tensor = tensor.permute(0, 5, 1, 3, 2, 4)
@@ -857,8 +874,7 @@ def __init__(self, config):
def forward(
self,
- pixel_values: torch.FloatTensor,
- prompt_pixel_values: torch.FloatTensor,
+ prompt_masks: torch.FloatTensor,
pred_masks: torch.FloatTensor,
labels: torch.FloatTensor,
bool_masked_pos: torch.BoolTensor,
@@ -866,11 +882,8 @@ def forward(
"""Computes the L1 loss between the predicted masks and the ground truth masks.
Args:
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
- Concatenated pixel values from prompt and input images.
-
- prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
- Concatenated pixel values from mask prompt.
+ prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values from mask prompt.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
Predicted masks.
@@ -884,12 +897,12 @@ def forward(
Returns:
`torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
"""
+ ground_truth = torch.cat((prompt_masks, labels), dim=2)
+
mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
- mask = unpatchify(mask, pixel_values.shape[1] // self.patch_size, pixel_values.shape[2] // self.patch_size)
- # Changing dummy mask in prompt_pixel_values to labels values
- prompt_pixel_values = prompt_pixel_values.clone()
- prompt_pixel_values[:, :, prompt_pixel_values.shape[2] // 2 :, :] = labels
- loss = F.smooth_l1_loss(pred_masks, prompt_pixel_values, reduction="none", beta=self.beta)
+ mask = unpatchify(mask, ground_truth.shape[2] // self.patch_size, ground_truth.shape[3] // self.patch_size)
+
+ loss = F.smooth_l1_loss(pred_masks, ground_truth, reduction="none", beta=self.beta)
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
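
To make the refactored loss concrete, here is a self-contained sketch of the same masked smooth-L1 reduction, with the patch bookkeeping replaced by a pixel-space mask over the bottom (prediction) half; all shapes and the `beta` value are illustrative:

```python
import torch
import torch.nn.functional as F

batch, channels, height, width = 2, 3, 16, 16
prompt_masks = torch.rand(batch, channels, height, width)
labels = torch.rand(batch, channels, height, width)
pred_masks = torch.rand(batch, channels, 2 * height, width)

# The ground truth is the prompt mask stacked on top of the label, as in the diff.
ground_truth = torch.cat((prompt_masks, labels), dim=2)  # (B, C, 2H, W)

# Default inference-style mask: only the bottom half (the predicted slot) contributes.
mask = torch.zeros_like(ground_truth)
mask[:, :, height:, :] = 1.0

loss = F.smooth_l1_loss(pred_masks, ground_truth, reduction="none", beta=0.01)
loss = (loss * mask).sum() / mask.sum()  # mean loss over masked pixels only
```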
@@ -976,6 +989,7 @@ def forward(
bool_masked_pos=bool_masked_pos,
feature_ensemble=feature_ensemble,
embedding_type=embedding_type,
+ labels=labels,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
@@ -988,7 +1002,7 @@ def forward(
loss = None
if labels is not None:
loss_fn = SegGptLoss(self.config)
- loss = loss_fn(pixel_values, prompt_pixel_values, pred_masks, labels, bool_masked_pos)
+ loss = loss_fn(prompt_masks, pred_masks, labels, bool_masked_pos)
if not return_dict:
output = (pred_masks,)
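
Taken together, these changes give `SegGptForImageSegmentation` a training path. A hedged usage sketch (the checkpoint name and 448x448 shapes are illustrative; `bool_masked_pos` is omitted here, which triggers the new warning above):

```python
import torch
from transformers import SegGptForImageSegmentation

model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large")

pixel_values = torch.rand(1, 3, 448, 448)
prompt_pixel_values = torch.rand(1, 3, 448, 448)
prompt_masks = torch.rand(1, 3, 448, 448)
labels = torch.rand(1, 3, 448, 448)

outputs = model(
    pixel_values=pixel_values,
    prompt_pixel_values=prompt_pixel_values,
    prompt_masks=prompt_masks,
    labels=labels,  # new: the loss is now computed from prompt_masks + labels
)
outputs.loss.backward()
```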
diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index cf83e8a39ebbb1..23bcccb31d61d3 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -492,6 +492,11 @@ def _init_weights(self, module):
logit_scale_init = torch.log(torch.tensor(1.0))
module.logit_scale.data.fill_(logit_scale_init)
module.logit_bias.data.zero_()
+ elif isinstance(module, SiglipForImageClassification):
+ nn.init.normal_(
+ module.classifier.weight,
+ std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
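
A standalone sketch of the initialization added for the classification head (numbers are illustrative; SigLIP-base uses `hidden_size=768` and `initializer_factor=1.0`):

```python
import torch
from torch import nn

hidden_size, num_labels, initializer_factor = 768, 10, 1.0
classifier = nn.Linear(hidden_size, num_labels)

# The std shrinks with the vision hidden size, keeping initial logits small.
nn.init.normal_(classifier.weight, std=hidden_size**-0.5 * initializer_factor)
print(float(classifier.weight.std()))  # roughly 768 ** -0.5 ~= 0.036
```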
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index c4e44854a0da43..b74819c7a1c91b 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -120,7 +120,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
- if width < height and width != size:
+ if width <= height and width != size:
height = int(size * height / width)
width = size
elif height < width and height != size:
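
The one-character change matters for square inputs: with the old strict `<`, neither branch fired and a square image was returned unresized. A simplified sketch of the helper (the `max_size` clamp is dropped for brevity):

```python
def get_size_with_aspect_ratio(height: int, width: int, size: int) -> tuple:
    # Simplified version of the helper above, without the max_size clamp.
    if width <= height and width != size:
        return int(size * height / width), size  # returns (height, width)
    if height < width and height != size:
        return size, int(size * width / height)
    return height, width

print(get_size_with_aspect_ratio(640, 640, 800))  # (800, 800) after the fix
# With the old `width < height`, a 640x640 input stayed (640, 640).
```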
diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
index cab0b0d4aec72b..ae6c0627bb2677 100644
--- a/src/transformers/pytorch_utils.py
+++ b/src/transformers/pytorch_utils.py
@@ -28,6 +28,7 @@
parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version)
+is_torch_greater_or_equal_than_2_3 = parsed_torch_version_base >= version.parse("2.3")
is_torch_greater_or_equal_than_2_2 = parsed_torch_version_base >= version.parse("2.2")
is_torch_greater_or_equal_than_2_1 = parsed_torch_version_base >= version.parse("2.1")
is_torch_greater_or_equal_than_2_0 = parsed_torch_version_base >= version.parse("2.0")
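
The new flag follows the existing pattern in this file. A short sketch of why the comparison goes through `base_version` (a reading of the file's intent, not part of the diff):

```python
from packaging import version

# Dev and pre-release builds would otherwise compare as *older* than "2.3":
full = version.parse("2.3.0.dev20240301")
base = version.parse(full.base_version)  # -> 2.3.0

print(full >= version.parse("2.3"))  # False: dev builds sort before the release
print(base >= version.parse("2.3"))  # True: base_version strips the suffix
```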
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f911e1c894b623..52beb6c1e56ff5 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -69,7 +69,11 @@
MODEL_MAPPING_NAMES,
)
from .optimization import Adafactor, get_scheduler
-from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from .pytorch_utils import (
+ ALL_LAYERNORM_LAYERS,
+ is_torch_greater_or_equal_than_1_13,
+ is_torch_greater_or_equal_than_2_3,
+)
from .tokenization_utils_base import PreTrainedTokenizerBase
from .trainer_callback import (
CallbackHandler,
@@ -620,7 +624,8 @@ def __init__(
if (args.fp16 or args.bf16) and args.half_precision_backend == "auto":
if args.device == torch.device("cpu"):
if args.fp16:
- raise ValueError("Tried to use `fp16` but it is not supported on cpu")
+ if not is_torch_greater_or_equal_than_2_3:
+ raise ValueError("Tried to use `fp16` but it is not supported on cpu")
else:
args.half_precision_backend = "cpu_amp"
logger.info(f"Using {args.half_precision_backend} half precision backend")
@@ -1682,6 +1687,12 @@ def _wrap_model(self, model, training=True, dataloader=None):
)
fsdp_kwargs = self.args.xla_fsdp_config
if self.args.fsdp_config["xla_fsdp_grad_ckpt"]:
+ if model.config.use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ model.config.use_cache = False
+
# Apply gradient checkpointing to auto-wrapped sub-modules if specified
def auto_wrapper_callable(m, *args, **kwargs):
target_cls = FSDP if not self.is_fsdp_xla_v2_enabled else FSDPv2
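
For context, the incompatibility handled here is generic rather than XLA-specific: the KV cache assumes activations are kept around, while gradient checkpointing recomputes them. The same guard, applied manually (model name illustrative):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative
if model.config.use_cache:
    model.config.use_cache = False  # the cache is useless once activations are recomputed
model.gradient_checkpointing_enable()
```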
@@ -3261,7 +3272,8 @@ def _save_tpu(self, output_dir: Optional[str] = None):
logger.info(f"Saving model checkpoint to {output_dir}")
model = self.model
xm.mark_step()
- model.to("cpu")
+ if self.args.save_safetensors:
+ model.to("cpu")
if xm.is_master_ordinal():
os.makedirs(output_dir, exist_ok=True)
@@ -3296,7 +3308,8 @@ def _save_tpu(self, output_dir: Optional[str] = None):
# We moved the model from TPU -> CPU for saving the weights.
# Now we should move it back so subsequent compute still works.
- model.to(self.args.device)
+ if self.args.save_safetensors:
+ model.to(self.args.device)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
# If we are executing this function, we are the process zero, so we don't check for that.
@@ -4348,6 +4361,18 @@ def create_accelerator_and_postprocess(self):
even_batches=accelerator_config.pop("even_batches"),
use_seedable_sampler=accelerator_config.pop("use_seedable_sampler"),
)
+ non_blocking = accelerator_config.pop("non_blocking")
+ if not is_accelerate_available("0.30.0"):
+ if non_blocking:
+ raise ImportError(
+ "`non_blocking` is only supported in accelerate v0.30.0 and above. Please upgrade accelerate to use this feature."
+ )
+ else:
+ if non_blocking and not self.args.dataloader_pin_memory:
+ logger.warning(
+ "`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both."
+ )
+ dataloader_config.non_blocking = non_blocking
# this would have been updated above, no need for it anymore
accelerator_config.pop("gradient_accumulation_kwargs")
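
A hedged sketch of opting into the new flag (requires accelerate >= 0.30.0; pinned host memory is what makes non-blocking host-to-device copies actually asynchronous, hence the warning above):

```python
from transformers import TrainingArguments
from transformers.trainer_pt_utils import AcceleratorConfig

args = TrainingArguments(
    output_dir="out",                 # illustrative
    dataloader_pin_memory=True,       # recommended companion setting
    accelerator_config=AcceleratorConfig(non_blocking=True),
)
```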
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index a4372ae78a79a2..9defa91b2b8bc8 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -1246,6 +1246,10 @@ class AcceleratorConfig:
The [`accelerate.utils.GradientAccumulationPlugin`] default is `True`.
sync_each_batch (`bool`): Whether to synchronize the gradients at each data batch.
The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`.
+ non_blocking (`bool`, *optional*, defaults to `False`):
+ Whether to use non-blocking CUDA calls to help minimize synchronization during
+ distributed training with prepared `DataLoader` inputs being moved to device.
+ Best if used with `dataloader_pin_memory=True` in the `TrainingArguments`.
"""
@@ -1284,6 +1288,17 @@ class AcceleratorConfig:
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
},
)
+
+ non_blocking: Optional[bool] = field(
+ default=False,
+ metadata={
+ "help": "Whether to use non-blocking CUDA calls to help minimize synchronization during "
+ "distributed training with prepared `DataLoader` inputs being moved to device. "
+ "Best if used with `pin_memory=True` in the `TrainingArguments`. Requires accelerate "
+ "v0.30.0."
+ },
+ )
+
gradient_accumulation_kwargs: Optional[Dict] = field(
default=None,
metadata={
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 5e81c22db93b50..91472eed9b0314 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -67,7 +67,7 @@
import torch
import torch.distributed as dist
- from .pytorch_utils import is_torch_greater_or_equal_than_2_0
+ from .pytorch_utils import is_torch_greater_or_equal_than_2_0, is_torch_greater_or_equal_than_2_3
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
@@ -1618,6 +1618,7 @@ def __post_init__(self):
if (
self.framework == "pt"
and is_torch_available()
+ and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
and (self.device.type != "cuda")
and (self.device.type != "mlu")
and (self.device.type != "npu")
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index f724d7dd6c41d5..8166c9d24297aa 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -6752,6 +6752,44 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class Phi3ForCausalLM(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Phi3ForSequenceClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Phi3ForTokenClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Phi3Model(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Phi3PreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST = None
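
These placeholders rely on the library's dummy-object mechanism: the class names stay importable in a torch-less environment, and instantiation raises an informative error instead of a bare `ImportError` at import time. A minimal sketch of the same pattern (the class name here is an illustrative stand-in, not the real Phi3 class):

```python
from transformers.utils import DummyObject, requires_backends

class Phi3ModelLike(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

# With torch installed, Phi3ModelLike() constructs normally;
# without it, __init__ raises an error naming the missing backend.
```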
diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md
index 52f481dcb3af06..8c8c7af0b333fb 100644
--- a/templates/adding_a_new_model/README.md
+++ b/templates/adding_a_new_model/README.md
@@ -16,257 +16,8 @@ limitations under the License.
# Adding a new model
-This folder contains templates to generate new models that fit the current API and pass all tests. It generates
-models in both PyTorch, TensorFlow, and Flax and completes the `__init__.py` and auto-modeling files, and creates the
-documentation. Their use is described in the [next section](#cookiecutter-templates).
+This page has been updated in light of the removal of the `add_new_model` script in favor of the more complete
+`add_new_model_like` script.
-There is also a CLI tool to generate a new model like an existing one called `transformers-cli add-new-model-like`.
-Jump to the [Add new model like section](#add-new-model-like-command) to learn how to use it.
-
-## Cookiecutter Templates
-
-Using the `cookiecutter` utility requires to have all the `dev` dependencies installed. Let's first clone the
-repository and install it in our environment:
-
-```shell script
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install -e ".[dev]"
-```
-
-Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
-(PyTorch, TensorFlow and/or Flax) then do:
-
-```bash
-pip install -e ".[quality]"
-```
-
-Once the installation is done, you can use the CLI command `add-new-model` to generate your models:
-
-```shell script
-transformers-cli add-new-model
-```
-
-This should launch the `cookiecutter` package which should prompt you to fill in the configuration.
-
-The `modelname` should be cased according to the plain text casing, i.e., BERT, RoBERTa, DeBERTa.
-```
-modelname []:
-uppercase_modelname []:
-lowercase_modelname []:
-camelcase_modelname []:
-```
-
-Fill in the `authors` with your team members:
-```
-authors [The HuggingFace Team]:
-```
-
-The checkpoint identifier is the checkpoint that will be used in the examples across the files. Put the name you wish,
-as it will appear on the modelhub. Do not forget to include the organisation.
-```
-checkpoint_identifier [organisation/-base-cased]:
-```
-
-The tokenizer should either be based on BERT if it behaves exactly like the BERT tokenizer, or a standalone otherwise.
-```
-Select tokenizer_type:
-1 - Based on BERT
-2 - Standalone
-Choose from 1, 2 [1]:
-```
-
-
-Once the command has finished, you should have a total of 7 new files spread across the repository:
-```
-docs/source/model_doc/.md
-src/transformers/models//configuration_.py
-src/transformers/models//modeling_.py
-src/transformers/models//modeling_tf_.py
-src/transformers/models//tokenization_.py
-tests/models//test_modeling_.py
-tests/models//test_modeling_tf_.py
-```
-
-You can run the tests to ensure that they all pass:
-
-```bash
-python -m pytest ./tests/test_**.py
-```
-
-Feel free to modify each file to mimic the behavior of your model.
-
-⚠️ You should be careful about the classes preceded by the following line:
-
-```python
-# Copied from transformers.[...]
-```
-
-This line ensures that the copy does not diverge from the source. If it *should* diverge, because the implementation
-is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`,
-your changes will be overwritten.
-
-Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change
-is needed!) afterwards to make sure everything works as expected.
-
-Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution
-will be merged quickly:
-
-- You should run the `make fixup` utility to fix the style of the files and to ensure the code quality meets the
- library's standards.
-- You should complete the documentation file (`docs/source/model_doc/.rst`) so that your model may be
- usable.
-
-## Add new model like command
-
-Using the `transformers-cli add-new-model-like` command requires to have all the `dev` dependencies installed. Let's
-first clone the repository and install it in our environment:
-
-```shell script
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install -e ".[dev]"
-```
-
-Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
-(PyTorch, TensorFlow and/or Flax) then do:
-
-```bash
-pip install -e ".[quality]"
-```
-
-Once the installation is done, you can use the CLI command `add-new-model-like` to generate your models:
-
-```shell script
-transformers-cli add-new-model-like
-```
-
-This will start a small questionnaire you have to fill.
-
-```
-What identifier would you like to use for the model type of this model?
-```
-
-You will have to input the model type of the model you want to clone. The model type can be found in several places:
-- inside the configuration of any checkpoint of that model
-- the name of the documentation page of that model
-
-For instance the doc page of `BigBirdPegasus` is `https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus`
-so its model type is `"bigbird_pegasus"`.
-
-If you make a typo, the command will suggest you the closest model types it can find.
-
-Once this is done, the questionnaire will ask you for the new model name and its various casings:
-
-```
-What is the name for your new model?
-What identifier would you like to use for the model type of this model?
-What name would you like to use for the module of this model?
-What prefix (camel-cased) would you like to use for the model classes of this model?
-What prefix (upper-cased) would you like to use for the constants relative to this model?
-```
-
-From your answer to the first question, defaults will be determined for all others. The first name should be written
-as you want your model be named in the doc, with no special casing (like RoBERTa) and from there, you can either stick
-with the defaults or change the cased versions.
-
-Next will be the name of the config class to use for this model:
-
-```
-What will be the name of the config class for this model?
-```
-
-Then, you will be asked for a checkpoint identifier:
-
-```
-Please give a checkpoint identifier (on the model Hub) for this new model.
-```
-
-This is the checkpoint that will be used in the examples across the files and the integration tests. Put the name you
-wish, as it will appear on the Model Hub. Do not forget to include the organisation.
-
-Then you will have to say whether your model re-uses the same processing classes as the model you're cloning:
-
-```
-Will your new model use the same processing class as Xxx (XxxTokenizer/XxxFeatureExtractor/XxxImageProcessor)
-```
-
-Answer yes if you have no intentions to make any change to the class used for preprocessing. It can use different
-files (for instance you can reuse the `BertTokenizer` with a new vocab file).
-
-If you answer no, you will have to give the name of the classes
-for the new tokenizer/image processor/feature extractor/processor (depending on the model you're cloning).
-
-Next the questionnaire will ask
-
-```
-Should we add # Copied from statements when creating the new modeling file?
-```
-
-This is the internal mechanism used in the library to make sure code copied from various modeling files stay consistent.
-If you plan to completely rewrite the modeling file, you should answer no, whereas if you just want to tweak one part
-of the model, you should answer yes.
-
-Lastly, the questionnaire will inquire about frameworks:
-
-```
-Should we add a version of your new model in all the frameworks implemented by Old Model (xxx)?
-```
-
-If you answer yes, the new model will have files for all the frameworks implemented by the model you're cloning.
-Otherwise, you will get a new question to select the frameworks you want.
-
-Once the command has finished, you will see a new subfolder in the `src/transformers/models/` folder, with the
-necessary files (configuration and modeling files for all frameworks requested, and maybe the processing files,
-depending on your choices).
-
-You will also see a doc file and tests for your new models. First you should run
-
-```bash
-make style
-make fix-copies
-```
-
-and then you can start tweaking your model. You should:
-- fill the doc file at `docs/source/model_doc/model_name.md`
-- tweak the configuration and modeling files to your need
-
-Once you're done, you can run the tests to ensure that they all pass:
-
-```bash
-python -m pytest ./tests/test_**.py
-```
-
-⚠️ You should be careful about the classes preceded by the following line:
-
-```python
-# Copied from transformers.[...]
-```
-
-This line ensures that the copy does not diverge from the source. If it *should* diverge, because the implementation
-is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`,
-your changes will be overwritten.
-
-Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change
-is needed!) afterwards to make sure everything works as expected.
-
-Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution
-will be merged quickly:
-
-- You should run the `make fixup` utility to fix the style of the files and to ensure the code quality meets the
- library's standards.
-- You should add your model to the main README then run `make fix-copies`.
+We recommend you check out the documentation of [How to add a model](https://huggingface.co/docs/transformers/main/en/add_new_model)
+in the Hugging Face Transformers documentation for complete and up-to-date instructions.
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py
deleted file mode 100644
index 5dd27ef591a180..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import TYPE_CHECKING
-
-from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available
-
-
-{%- if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...utils import is_tf_available
-
-
-{% endif %}
-{%- if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...utils import is_torch_available
-
-
-{% endif %}
-{%- if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...utils import is_flax_available
-
-
-{% endif %}
-
-_import_structure = {
- "configuration_{{cookiecutter.lowercase_modelname}}": ["{{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP", "{{cookiecutter.camelcase_modelname}}Config"],
- "tokenization_{{cookiecutter.lowercase_modelname}}": ["{{cookiecutter.camelcase_modelname}}Tokenizer"],
-}
-
-try:
- if not is_tokenizers_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["tokenization_{{cookiecutter.lowercase_modelname}}_fast"] = ["{{cookiecutter.camelcase_modelname}}TokenizerFast"]
-
-{%- if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-try:
- if not is_torch_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_{{cookiecutter.lowercase_modelname}}"] = [
- "{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST",
- "{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "{{cookiecutter.camelcase_modelname}}Layer",
- "{{cookiecutter.camelcase_modelname}}Model",
- "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- "load_tf_weights_in_{{cookiecutter.lowercase_modelname}}",
- ]
-{% else %}
-try:
- if not is_torch_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_{{cookiecutter.lowercase_modelname}}"] = [
- "{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST",
- "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "{{cookiecutter.camelcase_modelname}}Model",
- "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
-{% endif %}
-{% endif %}
-
-
-{%- if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-try:
- if not is_tf_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_tf_{{cookiecutter.lowercase_modelname}}"] = [
- "TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST",
- "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "TF{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "TF{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "TF{{cookiecutter.camelcase_modelname}}Layer",
- "TF{{cookiecutter.camelcase_modelname}}Model",
- "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
-{% else %}
-try:
- if not is_tf_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_tf_{{cookiecutter.lowercase_modelname}}"] = [
- "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "TF{{cookiecutter.camelcase_modelname}}Model",
- "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
-{% endif %}
-{% endif %}
-
-
-{%- if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-try:
- if not is_flax_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_flax_{{cookiecutter.lowercase_modelname}}"] = [
- "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "Flax{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "Flax{{cookiecutter.camelcase_modelname}}Layer",
- "Flax{{cookiecutter.camelcase_modelname}}Model",
- "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
-{% else %}
-try:
- if not is_flax_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- pass
-else:
- _import_structure["modeling_flax_{{cookiecutter.lowercase_modelname}}"] = [
- "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "Flax{{cookiecutter.camelcase_modelname}}Model",
- "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
-{% endif %}
-{% endif %}
-
-
-if TYPE_CHECKING:
- from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, {{cookiecutter.camelcase_modelname}}Config
- from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
- try:
- if not is_tokenizers_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .tokenization_{{cookiecutter.lowercase_modelname}}_fast import {{cookiecutter.camelcase_modelname}}TokenizerFast
-
-{%- if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- try:
- if not is_torch_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
- {{cookiecutter.camelcase_modelname}}ForMaskedLM,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}ForTokenClassification,
- {{cookiecutter.camelcase_modelname}}Layer,
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}PreTrainedModel,
- load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
- )
-{% else %}
- try:
- if not is_torch_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
- {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif %}
-{% endif %}
-{%- if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- try:
- if not is_tf_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
- TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
- TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
- TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- TF{{cookiecutter.camelcase_modelname}}Layer,
- TF{{cookiecutter.camelcase_modelname}}Model,
- TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% else %}
- try:
- if not is_tf_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
- TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- TF{{cookiecutter.camelcase_modelname}}Model,
- TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif %}
-{% endif %}
-{%- if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- try:
- if not is_flax_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_{{cookiecutter.lowercase_modelname}} import (
- Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- Flax{{cookiecutter.camelcase_modelname}}Layer,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% else %}
- try:
- if not is_flax_available():
- raise OptionalDependencyNotAvailable()
- except OptionalDependencyNotAvailable:
- pass
- else:
- from .modeling_{{cookiecutter.lowercase_modelname}} import (
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif %}
-{% endif %}
-
-else:
- import sys
-
- sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json
deleted file mode 100644
index fea453b421fa20..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "{{cookiecutter.modelname}}",
- "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}",
- "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}",
- "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}",
- "authors": "{{cookiecutter.authors}}",
- "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}",
- "tokenizer_type": "{{cookiecutter.tokenizer_type}}",
- "generate_tensorflow_pytorch_and_flax": "{{cookiecutter.generate_tensorflow_pytorch_and_flax}}",
- "is_encoder_decoder_model": "{{cookiecutter.is_encoder_decoder_model}}"
-}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index 61f4e81d744193..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" {{cookiecutter.modelname}} model configuration """
-
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
- r"""
- This is the configuration class to store the configuration of a [`~{{cookiecutter.camelcase_modelname}}Model`].
- It is used to instantiate an {{cookiecutter.modelname}} model according to the specified arguments, defining the model
- architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
- the {{cookiecutter.modelname}} [{{cookiecutter.checkpoint_identifier}}](https://huggingface.co/{{cookiecutter.checkpoint_identifier}}) architecture.
-
- Configuration objects inherit from [`PretrainedConfig`] and can be used
- to control the model outputs. Read the documentation from [`PretrainedConfig`]
- for more information.
-
-
- Args:
- {% if cookiecutter.is_encoder_decoder_model == "False" -%}
- vocab_size (`int`, *optional*, defaults to 30522):
- Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
- `inputs_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
- [`~TF{{cookiecutter.camelcase_modelname}}Model`].
- hidden_size (`int`, *optional*, defaults to 768):
- Dimension of the encoder layers and the pooler layer.
- num_hidden_layers (`int`, *optional*, defaults to 12):
- Number of hidden layers in the Transformer encoder.
- num_attention_heads (`int`, *optional*, defaults to 12):
- Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (`int`, *optional*, defaults to 3072):
- Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
- The non-linear activation function (function or string) in the encoder and pooler.
- If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
- hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
- The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
- The dropout ratio for the attention probabilities.
- max_position_embeddings (`int`, *optional*, defaults to 512):
- The maximum sequence length that this model might ever be used with.
- Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (`int`, *optional*, defaults to 2):
- The vocabulary size of the `token_type_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
- [`~TF{{cookiecutter.camelcase_modelname}}Model`].
- initializer_range (`float`, *optional*, defaults to 0.02):
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (`float`, *optional*, defaults to 1e-12):
- The epsilon used by the layer normalization layers.
- use_cache (`bool`, *optional*, defaults to `True`):
- Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if `config.is_decoder=True`.
- {% else -%}
- vocab_size (`int`, *optional*, defaults to 50265):
- Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
- `inputs_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
- [`~TF{{cookiecutter.camelcase_modelname}}Model`].
- d_model (`int`, *optional*, defaults to 1024):
- Dimension of the layers and the pooler layer.
- encoder_layers (`int`, *optional*, defaults to 12):
- Number of encoder layers.
- decoder_layers (`int`, *optional*, defaults to 12):
- Number of decoder layers.
- encoder_attention_heads (`int`, *optional*, defaults to 16):
- Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (`int`, *optional*, defaults to 16):
- Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (`int`, *optional*, defaults to 4096):
- Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (`int`, *optional*, defaults to 4096):
- Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
- The non-linear activation function (function or string) in the encoder and pooler. If string,
- `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
- dropout (`float`, *optional*, defaults to 0.1):
- The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for the attention probabilities.
- activation_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for classifier.
- max_position_embeddings (`int`, *optional*, defaults to 1024):
- The maximum sequence length that this model might ever be used with. Typically set this to something large
- just in case (e.g., 512 or 1024 or 2048).
- init_std (`float`, *optional*, defaults to 0.02):
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop (`float`, *optional*, defaults to 0.0):
- The LayerDrop probability for the encoder. See the [LayerDrop paper](see
- https://arxiv.org/abs/1909.11556) for more details.
- decoder_layerdrop (`float`, *optional*, defaults to 0.0):
- The LayerDrop probability for the decoder. See the [LayerDrop paper](see
- https://arxiv.org/abs/1909.11556) for more details.
- use_cache (`bool`, *optional*, defaults to `True`):
- Whether or not the model should return the last key/values attentions (not used by all models).
- {% endif -%}
-
- Example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config
-
- >>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration
- >>> configuration = {{cookiecutter.camelcase_modelname}}Config()
-
- >>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration
- >>> model = {{cookiecutter.camelcase_modelname}}Model(configuration)
-
- >>> # Accessing the model configuration
- >>> configuration = model.config
- ```
-"""
- model_type = "{{cookiecutter.lowercase_modelname}}"
- {% if cookiecutter.is_encoder_decoder_model == "False" -%}
- {% else -%}
- keys_to_ignore_at_inference = ["past_key_values"]
- {% endif -%}
-
- {% if cookiecutter.is_encoder_decoder_model == "False" %}
- {%- else %}
- attribute_map = {
- "num_attention_heads": "encoder_attention_heads",
- "hidden_size": "d_model"
- }
-
- {%- endif %}
-
- def __init__(
- self,
- {% if cookiecutter.is_encoder_decoder_model == "False" -%}
- vocab_size=30522,
- hidden_size=768,
- num_hidden_layers=12,
- num_attention_heads=12,
- intermediate_size=3072,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=2,
- initializer_range=0.02,
- layer_norm_eps=1e-12,
- use_cache=True,
- {% else -%}
- vocab_size=50265,
- max_position_embeddings=1024,
- encoder_layers=12,
- encoder_ffn_dim=4096,
- encoder_attention_heads=16,
- decoder_layers=12,
- decoder_ffn_dim=4096,
- decoder_attention_heads=16,
- encoder_layerdrop=0.0,
- decoder_layerdrop=0.0,
- use_cache=True,
- is_encoder_decoder=True,
- activation_function="gelu",
- d_model=1024,
- dropout=0.1,
- attention_dropout=0.0,
- activation_dropout=0.0,
- init_std=0.02,
- decoder_start_token_id=2,
- classifier_dropout=0.0,
- scale_embedding=False,
- {% endif -%}
- pad_token_id=1,
- bos_token_id=0,
- eos_token_id=2,
- **kwargs
- ):
- self.vocab_size = vocab_size
- self.max_position_embeddings = max_position_embeddings
- {% if cookiecutter.is_encoder_decoder_model == "False" -%}
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.initializer_range = initializer_range
- self.type_vocab_size = type_vocab_size
- self.layer_norm_eps = layer_norm_eps
- self.use_cache = use_cache
- {% else -%}
- self.d_model = d_model
- self.encoder_ffn_dim = encoder_ffn_dim
- self.encoder_layers = encoder_layers
- self.encoder_attention_heads = encoder_attention_heads
- self.decoder_ffn_dim = decoder_ffn_dim
- self.decoder_layers = decoder_layers
- self.decoder_attention_heads = decoder_attention_heads
- self.dropout = dropout
- self.attention_dropout = attention_dropout
- self.activation_dropout = activation_dropout
- self.activation_function = activation_function
- self.init_std = init_std
- self.encoder_layerdrop = encoder_layerdrop
- self.decoder_layerdrop = decoder_layerdrop
- self.classifier_dropout = classifier_dropout
- self.use_cache = use_cache
- self.num_hidden_layers = encoder_layers
- self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
-
- {% endif -%}
- super().__init__(
- pad_token_id=pad_token_id,
- bos_token_id=bos_token_id,
- eos_token_id=eos_token_id,
- {% if cookiecutter.is_encoder_decoder_model == "False" -%}
- {% else -%}
- is_encoder_decoder=is_encoder_decoder,
- decoder_start_token_id=decoder_start_token_id,
- {% endif -%}
- **kwargs
- )
-
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index 6cccf46eeb62d6..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,3240 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Flax {{cookiecutter.modelname}} model. """
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-from typing import Callable, Optional, Tuple
-
-import numpy as np
-
-import flax.linen as nn
-import jax
-import jax.numpy as jnp
-from flax.core.frozen_dict import FrozenDict, unfreeze, freeze
-from flax.linen import combine_masks, make_causal_mask
-from flax.linen import partitioning as nn_partitioning
-from flax.traverse_util import flatten_dict, unflatten_dict
-from flax.linen.attention import dot_product_attention_weights
-from jax import lax
-
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
-from ...modeling_flax_outputs import (
- FlaxBaseModelOutputWithPastAndCrossAttentions,
- FlaxBaseModelOutputWithPoolingAndCrossAttentions,
- FlaxCausalLMOutput,
- FlaxCausalLMOutputWithCrossAttentions,
- FlaxMaskedLMOutput,
- FlaxMultipleChoiceModelOutput,
- FlaxQuestionAnsweringModelOutput,
- FlaxSequenceClassifierOutput,
- FlaxTokenClassifierOutput,
-)
-from ...modeling_flax_utils import (
- ACT2FN,
- FlaxPreTrainedModel,
- append_call_sample_docstring,
- overwrite_call_docstring,
-)
-from ...utils import logging
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer"
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
-
- This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
- generic methods the library implements for all its model (such as downloading, saving and converting weights from
- PyTorch models)
-
- This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as a regular Flax linen Module
- and refer to the Flax documentation for all matter related to general usage and behavior.
-
- Finally, this model supports inherent JAX features such as:
-
- - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
-
- Parameters:
- config ([`~{{cookiecutter.uppercase_modelname}}Config`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
- model weights.
- dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
- The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
- GPUs) and `jax.numpy.bfloat16` (on TPUs).
-
- This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given `dtype`.
-
- **Note that this only specifies the dtype of the computation and does not influence the dtype of model
- parameters.**
-
- If you wish to change the dtype of the model parameters, see
- [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
-"""
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`numpy.ndarray` of shape `({0})`):
- Indices of input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`~{{cookiecutter.uppercase_modelname}}ConfiTokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
-
- - 0 corresponds to a *sentence A* token,
- - 1 corresponds to a *sentence B* token.
-
- [What are token type IDs?](../glossary#token-type-ids)
- position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- head_mask (`numpy.ndarray` of shape `({0})`, `optional): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-
-"""
-
-remat = nn_partitioning.remat
-
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Embeddings(nn.Module):
- """Construct the embeddings from word, position and token_type embeddings."""
-
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.word_embeddings = nn.Embed(
- self.config.vocab_size,
- self.config.hidden_size,
- embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
- )
- self.position_embeddings = nn.Embed(
- self.config.max_position_embeddings,
- self.config.hidden_size,
- embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
- )
- self.token_type_embeddings = nn.Embed(
- self.config.type_vocab_size,
- self.config.hidden_size,
- embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
- )
- self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
-
- def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
- # Embed
- inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
- position_embeds = self.position_embeddings(position_ids.astype("i4"))
- token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
-
- # Sum all embeddings
- hidden_states = inputs_embeds + token_type_embeddings + position_embeds
-
- # Layer Norm
- hidden_states = self.LayerNorm(hidden_states)
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- causal: bool = False
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.head_dim = self.config.hidden_size // self.config.num_attention_heads
- if self.config.hidden_size % self.config.num_attention_heads != 0:
- raise ValueError(
- "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`\
- : {self.config.num_attention_heads}"
- )
-
- self.query = nn.Dense(
- self.config.hidden_size,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- )
- self.key = nn.Dense(
- self.config.hidden_size,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- )
- self.value = nn.Dense(
- self.config.hidden_size,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- )
-
- if self.causal:
- self.causal_mask = make_causal_mask(
- jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
- )
-
- def _split_heads(self, hidden_states):
- return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
-
- def _merge_heads(self, hidden_states):
- return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
-
- @nn.compact
- # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
- def _concatenate_to_cache(self, key, value, query, attention_mask):
- """
- This function takes projected key, value states from a single input token and concatenates the states to cached
- states from previous steps. This function is slighly adapted from the official Flax repository:
- https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
- """
- # detect if we're initializing by absence of existing cache data.
- is_initialized = self.has_variable("cache", "cached_key")
- cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
- cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
- cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
-
- if is_initialized:
- *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
- # update key, value caches with our new 1d spatial slices
- cur_index = cache_index.value
- indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
- key = lax.dynamic_update_slice(cached_key.value, key, indices)
- value = lax.dynamic_update_slice(cached_value.value, value, indices)
- cached_key.value = key
- cached_value.value = value
- num_updated_cache_vectors = query.shape[1]
- cache_index.value = cache_index.value + num_updated_cache_vectors
- # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
- pad_mask = jnp.broadcast_to(
- jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
- tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
- )
- attention_mask = combine_masks(pad_mask, attention_mask)
- return key, value, attention_mask
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- layer_head_mask,
- key_value_states: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic=True,
- output_attentions: bool = False,
- ):
- # if key_value_states are provided this layer is used as a cross-attention layer
- # for the decoder
- is_cross_attention = key_value_states is not None
- batch_size = hidden_states.shape[0]
-
- # get query proj
- query_states = self.query(hidden_states)
- # get key, value proj
- if is_cross_attention:
- # cross_attentions
- key_states = self.key(key_value_states)
- value_states = self.value(key_value_states)
- else:
- # self_attention
- key_states = self.key(hidden_states)
- value_states = self.value(hidden_states)
-
- query_states = self._split_heads(query_states)
- key_states = self._split_heads(key_states)
- value_states = self._split_heads(value_states)
-
- # handle cache prepare causal attention mask
- if self.causal:
- query_length, key_length = query_states.shape[1], key_states.shape[1]
- if self.has_variable("cache", "cached_key"):
- mask_shift = self.variables["cache"]["cache_index"]
- max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
- causal_mask = lax.dynamic_slice(
- self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
- )
- else:
- causal_mask = self.causal_mask[:, :, :query_length, :key_length]
- causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
-
- # combine masks if needed
- if attention_mask is not None and self.causal:
- attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
- attention_mask = combine_masks(attention_mask, causal_mask)
- elif self.causal:
- attention_mask = causal_mask
- elif attention_mask is not None:
- attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
-
- # During fast autoregressive decoding, we feed one position at a time,
- # and cache the keys and values step by step.
- if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
- key_states, value_states, attention_mask = self._concatenate_to_cache(
- key_states, value_states, query_states, attention_mask
- )
-
- # Convert the boolean attention mask to an attention bias.
- if attention_mask is not None:
- # attention mask in the form of attention bias
- attention_bias = lax.select(
- attention_mask > 0,
- jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
- jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
- )
- else:
- attention_bias = None
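-
- # Example (illustrative): a boolean mask [True, True, False] becomes the bias
- # [0.0, 0.0, jnp.finfo(dtype).min]; added to the attention logits before the
- # softmax, it pushes masked positions to (near-)zero weight without any
- # Python-level branching inside the kernel.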
-
- dropout_rng = None
- if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
- dropout_rng = self.make_rng("dropout")
-
- attn_weights = dot_product_attention_weights(
- query_states,
- key_states,
- bias=attention_bias,
- dropout_rng=dropout_rng,
- dropout_rate=self.config.attention_probs_dropout_prob,
- broadcast_dropout=True,
- deterministic=deterministic,
- dtype=self.dtype,
- precision=None,
- )
-
- # Mask heads if we want to
- if layer_head_mask is not None:
- attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
-
- attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
- attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
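- # einsum sketch: "...hqk,h->...hqk" rescales each head's probabilities by its
- # head-mask entry; "...hqk,...khd->...qhd" sums the values over key positions
- # per head, and the reshape above merges heads back to (batch, q_len, hidden).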
-
- outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}SelfOutput(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.dense = nn.Dense(
- self.config.hidden_size,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- dtype=self.dtype,
- )
- self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
-
- def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Attention(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- causal: bool = False
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.self = Flax{{cookiecutter.camelcase_modelname}}SelfAttention(self.config, dtype=self.dtype)
- self.output = Flax{{cookiecutter.camelcase_modelname}}SelfOutput(self.config, dtype=self.dtype)
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- layer_head_mask,
- key_value_states=None,
- init_cache=False,
- deterministic=True,
- output_attentions: bool = False,
- ):
- # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
- # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
- # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
- attn_outputs = self.self(
- hidden_states,
- attention_mask,
- layer_head_mask=layer_head_mask,
- key_value_states=key_value_states,
- init_cache=init_cache,
- deterministic=deterministic,
- output_attentions=output_attentions,
- )
- attn_output = attn_outputs[0]
- hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (attn_outputs[1],)
-
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Intermediate(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.dense = nn.Dense(
- self.config.intermediate_size,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- dtype=self.dtype,
- )
- self.activation = ACT2FN[self.config.hidden_act]
-
- def __call__(self, hidden_states):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.activation(hidden_states)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Output(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.dense = nn.Dense(
- self.config.hidden_size,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- dtype=self.dtype,
- )
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
- self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
-
- def __call__(self, hidden_states, attention_output, deterministic: bool = True):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- hidden_states = self.LayerNorm(hidden_states + attention_output)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Layer(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.attention = Flax{{cookiecutter.camelcase_modelname}}Attention(self.config, dtype=self.dtype)
- self.intermediate = Flax{{cookiecutter.camelcase_modelname}}Intermediate(self.config, dtype=self.dtype)
- self.output = Flax{{cookiecutter.camelcase_modelname}}Output(self.config, dtype=self.dtype)
- if self.config.add_cross_attention:
- self.crossattention = Flax{{cookiecutter.camelcase_modelname}}Attention(self.config, causal=False, dtype=self.dtype)
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- layer_head_mask,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- output_attentions: bool = False,
- ):
- # Self Attention
- attention_outputs = self.attention(
- hidden_states,
- attention_mask,
- layer_head_mask=layer_head_mask,
- init_cache=init_cache,
- deterministic=deterministic,
- output_attentions=output_attentions,
- )
- attention_output = attention_outputs[0]
-
- # Cross-Attention Block
- if encoder_hidden_states is not None:
- cross_attention_outputs = self.crossattention(
- attention_output,
- attention_mask=encoder_attention_mask,
- layer_head_mask=layer_head_mask,
- key_value_states=encoder_hidden_states,
- deterministic=deterministic,
- output_attentions=output_attentions,
- )
- attention_output = cross_attention_outputs[0]
-
- hidden_states = self.intermediate(attention_output)
- hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (attention_outputs[1],)
- if encoder_hidden_states is not None:
- outputs += (cross_attention_outputs[1],)
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}LayerCollection(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- gradient_checkpointing: bool = False
-
- def setup(self):
- if self.gradient_checkpointing:
- Flax{{cookiecutter.camelcase_modelname}}CheckpointLayer = remat(Flax{{cookiecutter.camelcase_modelname}}Layer, static_argnums=(5, 6, 7))
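- # Note (descriptive): argument positions 5-7 of the layer's __call__
- # (init_cache, deterministic, output_attentions) are Python bools, so they
- # are declared static for remat; changing them triggers a retrace rather
- # than being traced as array inputs.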
- self.layers = [
- Flax{{cookiecutter.camelcase_modelname}}CheckpointLayer(self.config, name=str(i), dtype=self.dtype)
- for i in range(self.config.num_hidden_layers)
- ]
- else:
- self.layers = [
- Flax{{cookiecutter.camelcase_modelname}}Layer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
- ]
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- head_mask,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- all_attentions = () if output_attentions else None
- all_hidden_states = () if output_hidden_states else None
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-
- # Check if head_mask has a correct number of layers specified if desired
- if head_mask is not None:
- if head_mask.shape[0] != len(self.layers):
- raise ValueError(
- f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
- f"{head_mask.shape[0]}."
- )
-
- for i, layer in enumerate(self.layers):
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- layer_outputs = layer(
- hidden_states,
- attention_mask,
- head_mask[i] if head_mask is not None else None,
- encoder_hidden_states,
- encoder_attention_mask,
- init_cache,
- deterministic,
- output_attentions,
- )
-
- hidden_states = layer_outputs[0]
-
- if output_attentions:
- all_attentions += (layer_outputs[1],)
-
- if encoder_hidden_states is not None:
- all_cross_attentions += (layer_outputs[2],)
-
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
-
- if not return_dict:
- return tuple(v for v in outputs if v is not None)
-
- return FlaxBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- hidden_states=all_hidden_states,
- attentions=all_attentions,
- cross_attentions=all_cross_attentions,
- )
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Encoder(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.layer = Flax{{cookiecutter.camelcase_modelname}}LayerCollection(self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- head_mask,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- return self.layer(
- hidden_states,
- attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- init_cache=init_cache,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Pooler(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.dense = nn.Dense(
- self.config.hidden_size,
- kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
- dtype=self.dtype,
- )
-
- def __call__(self, hidden_states):
- cls_hidden_state = hidden_states[:, 0]
- cls_hidden_state = self.dense(cls_hidden_state)
- return nn.tanh(cls_hidden_state)
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
- self.activation = ACT2FN[self.config.hidden_act]
- self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
-
- def __call__(self, hidden_states):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.activation(hidden_states)
- return self.LayerNorm(hidden_states)
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}LMPredictionHead(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
-
- def setup(self):
- self.transform = Flax{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(self.config, dtype=self.dtype)
- self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
- self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
-
- def __call__(self, hidden_states, shared_embedding=None):
- hidden_states = self.transform(hidden_states)
-
- if shared_embedding is not None:
- hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
- else:
- hidden_states = self.decoder(hidden_states)
-
- hidden_states += self.bias
- return hidden_states
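-
- # Weight-tying note (descriptive): when `shared_embedding` is passed (the word
- # embedding matrix, shape (vocab_size, hidden_size)), the decoder's kernel is
- # substituted with its transpose via `nn.Dense.apply`, so the input embedding
- # and the output projection share a single parameter matrix.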
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}OnlyMLMHead(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.predictions = Flax{{cookiecutter.camelcase_modelname}}LMPredictionHead(self.config, dtype=self.dtype)
-
- def __call__(self, hidden_states, shared_embedding=None):
- hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyNSPHead with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}OnlyNSPHead(nn.Module):
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.seq_relationship = nn.Dense(2, dtype=self.dtype)
-
- def __call__(self, pooled_output):
- return self.seq_relationship(pooled_output)
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainingHeads with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}PreTrainingHeads(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.predictions = Flax{{cookiecutter.camelcase_modelname}}LMPredictionHead(self.config, dtype=self.dtype)
- self.seq_relationship = nn.Dense(2, dtype=self.dtype)
-
- def __call__(self, hidden_states, pooled_output, shared_embedding=None):
- prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
- seq_relationship_score = self.seq_relationship(pooled_output)
- return prediction_scores, seq_relationship_score
-
-
-class Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel(FlaxPreTrainedModel):
- """
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
- models.
- """
-
- config_class = {{cookiecutter.camelcase_modelname}}Config
- base_model_prefix = "{{cookiecutter.lowercase_modelname}}"
- module_class: nn.Module = None
-
- def __init__(
- self,
- config: {{cookiecutter.camelcase_modelname}}Config,
- input_shape: Tuple = (1, 1),
- seed: int = 0,
- dtype: jnp.dtype = jnp.float32,
- _do_init: bool = True,
- gradient_checkpointing: bool = False,
- **kwargs
- ):
- module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
- super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
-
- # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
- def enable_gradient_checkpointing(self):
- self._module = self.module_class(
- config=self.config,
- dtype=self.dtype,
- gradient_checkpointing=True,
- )
-
- # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.init_weights with Bert->{{cookiecutter.camelcase_modelname}}
- def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
- # init input tensors
- input_ids = jnp.zeros(input_shape, dtype="i4")
- token_type_ids = jnp.zeros_like(input_ids)
- position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
- attention_mask = jnp.ones_like(input_ids)
- head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
-
- params_rng, dropout_rng = jax.random.split(rng)
- rngs = {"params": params_rng, "dropout": dropout_rng}
-
- if self.config.add_cross_attention:
- encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
- encoder_attention_mask = attention_mask
- module_init_outputs = self.module.init(
- rngs,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- return_dict=False,
- )
- else:
- module_init_outputs = self.module.init(
- rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
- )
-
- random_params = module_init_outputs["params"]
-
- if params is not None:
- random_params = flatten_dict(unfreeze(random_params))
- params = flatten_dict(unfreeze(params))
- for missing_key in self._missing_keys:
- params[missing_key] = random_params[missing_key]
- self._missing_keys = set()
- return freeze(unflatten_dict(params))
- else:
- return random_params
-
- # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.init_cache with Bert->{{cookiecutter.camelcase_modelname}}
- def init_cache(self, batch_size, max_length):
- r"""
- Args:
- batch_size (`int`):
- batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (`int`):
- maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
- cache.
- """
- # init input variables to retrieve cache
- input_ids = jnp.ones((batch_size, max_length))
- attention_mask = jnp.ones_like(input_ids)
- position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
-
- init_variables = self.module.init(
- jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
- )
- return unfreeze(init_variables["cache"])
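-
- # Usage sketch (hypothetical; assumes a generated decoder-style checkpoint):
- # past_key_values = model.init_cache(batch_size=1, max_length=32)
- # outputs = model(input_ids, past_key_values=past_key_values)
- # past_key_values = outputs.past_key_values # feed back on the next step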
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.__call__ with Bert->{{cookiecutter.camelcase_modelname}}
- def __call__(
- self,
- input_ids,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- params: dict = None,
- dropout_rng: jax.random.PRNGKey = None,
- train: bool = False,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- past_key_values: dict = None,
- ):
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.return_dict
-
- # init input tensors if not passed
- if token_type_ids is None:
- token_type_ids = jnp.zeros_like(input_ids)
-
- if position_ids is None:
- position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
-
- if attention_mask is None:
- attention_mask = jnp.ones_like(input_ids)
-
- if head_mask is None:
- head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
-
- # Handle any PRNG if needed
- rngs = {}
- if dropout_rng is not None:
- rngs["dropout"] = dropout_rng
-
- inputs = {"params": params or self.params}
-
- if self.config.add_cross_attention:
- # If past_key_values are passed, the cache is already initialized and a private flag, init_cache,
- # has to be passed down to ensure the cache is used. The cache must also be marked as mutable so
- # that it can be changed by the Flax{{cookiecutter.camelcase_modelname}}Attention module.
- if past_key_values:
- inputs["cache"] = past_key_values
- mutable = ["cache"]
- else:
- mutable = False
-
- outputs = self.module.apply(
- inputs,
- jnp.array(input_ids, dtype="i4"),
- jnp.array(attention_mask, dtype="i4"),
- token_type_ids=jnp.array(token_type_ids, dtype="i4"),
- position_ids=jnp.array(position_ids, dtype="i4"),
- head_mask=jnp.array(head_mask, dtype="i4"),
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- deterministic=not train,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- rngs=rngs,
- mutable=mutable,
- )
-
- # add updated cache to model output
- if past_key_values is not None and return_dict:
- outputs, past_key_values = outputs
- outputs["past_key_values"] = unfreeze(past_key_values["cache"])
- return outputs
- elif past_key_values is not None and not return_dict:
- outputs, past_key_values = outputs
- outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
-
- else:
- outputs = self.module.apply(
- inputs,
- jnp.array(input_ids, dtype="i4"),
- jnp.array(attention_mask, dtype="i4"),
- token_type_ids=jnp.array(token_type_ids, dtype="i4"),
- position_ids=jnp.array(position_ids, dtype="i4"),
- head_mask=jnp.array(head_mask, dtype="i4"),
- deterministic=not train,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- rngs=rngs,
- )
-
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert->{{cookiecutter.camelcase_modelname}}
-class Flax{{cookiecutter.camelcase_modelname}}Module(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- add_pooling_layer: bool = True
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.embeddings = Flax{{cookiecutter.camelcase_modelname}}Embeddings(self.config, dtype=self.dtype)
- self.encoder = Flax{{cookiecutter.camelcase_modelname}}Encoder(self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
- self.pooler = Flax{{cookiecutter.camelcase_modelname}}Pooler(self.config, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids: Optional[jnp.ndarray] = None,
- position_ids: Optional[jnp.ndarray] = None,
- head_mask: Optional[jnp.ndarray] = None,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # make sure `token_type_ids` is correctly initialized when not passed
- if token_type_ids is None:
- token_type_ids = jnp.zeros_like(input_ids)
-
- # make sure `position_ids` is correctly initialized when not passed
- if position_ids is None:
- position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
-
- hidden_states = self.embeddings(
- input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
- )
- outputs = self.encoder(
- hidden_states,
- attention_mask,
- head_mask=head_mask,
- deterministic=deterministic,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- init_cache=init_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- hidden_states = outputs[0]
- pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
-
- if not return_dict:
- # if pooled is None, don't return it
- if pooled is None:
- return (hidden_states,) + outputs[1:]
- return (hidden_states, pooled) + outputs[1:]
-
- return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
- last_hidden_state=hidden_states,
- pooler_output=pooled,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
-@add_start_docstrings(
- "The bare {{cookiecutter.camelcase_modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}Model(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}Module
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForMaskedLMModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, add_pooling_layer=False, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
- self.cls = Flax{{cookiecutter.camelcase_modelname}}OnlyMLMHead(config=self.config, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- hidden_states = outputs[0]
- if self.config.tie_word_embeddings:
- shared_embedding = self.{{cookiecutter.lowercase_modelname}}.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
- else:
- shared_embedding = None
-
- # Compute the prediction scores
- logits = self.cls(hidden_states, shared_embedding=shared_embedding)
-
- if not return_dict:
- return (logits,) + outputs[1:]
-
- return FlaxMaskedLMOutput(
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings("""{{cookiecutter.camelcase_modelname}} Model with a `language modeling` head on top for MLM training. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
-class Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForMaskedLMModule
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassificationModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
- self.classifier = nn.Dense(
- self.config.num_labels,
- dtype=self.dtype,
- )
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- pooled_output = outputs[1]
- pooled_output = self.dropout(pooled_output, deterministic=deterministic)
- logits = self.classifier(pooled_output)
-
- if not return_dict:
- return (logits,) + outputs[2:]
-
- return FlaxSequenceClassifierOutput(
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
- output) e.g. for GLUE tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassificationModule
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- _TOKENIZER_FOR_DOC,
- _CHECKPOINT_FOR_DOC,
- FlaxSequenceClassifierOutput,
- _CONFIG_FOR_DOC,
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoiceModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
- self.classifier = nn.Dense(1, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- num_choices = input_ids.shape[1]
- input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
- attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
- token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
- position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
-
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- pooled_output = outputs[1]
- pooled_output = self.dropout(pooled_output, deterministic=deterministic)
- logits = self.classifier(pooled_output)
-
- reshaped_logits = logits.reshape(-1, num_choices)
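- # Shape sketch (illustrative): inputs of shape (batch, num_choices, seq) were
- # flattened to (batch * num_choices, seq) above; the classifier emits one
- # logit per flattened row, and this reshape recovers (batch, num_choices) so
- # that softmax/argmax can run over each example's choices.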
-
- if not return_dict:
- return (reshaped_logits,) + outputs[2:]
-
- return FlaxMultipleChoiceModelOutput(
- logits=reshaped_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
- softmax) e.g. for RocStories/SWAG tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoiceModule
-
-
-overwrite_call_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice, {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
-)
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxMultipleChoiceModelOutput, _CONFIG_FOR_DOC
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForTokenClassificationModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype, add_pooling_layer=False, gradient_checkpointing=self.gradient_checkpointing)
- self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
- self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- hidden_states = outputs[0]
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- logits = self.classifier(hidden_states)
-
- if not return_dict:
- return (logits,) + outputs[1:]
-
- return FlaxTokenClassifierOutput(
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
- Named-Entity-Recognition (NER) tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForTokenClassificationModule
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxTokenClassifierOutput, _CONFIG_FOR_DOC
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnsweringModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype, add_pooling_layer=False, gradient_checkpointing=self.gradient_checkpointing)
- self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- hidden_states = outputs[0]
-
- logits = self.qa_outputs(hidden_states)
- start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
- start_logits = start_logits.squeeze(-1)
- end_logits = end_logits.squeeze(-1)
-
- if not return_dict:
- return (start_logits, end_logits) + outputs[1:]
-
- return FlaxQuestionAnsweringModelOutput(
- start_logits=start_logits,
- end_logits=end_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
- layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnsweringModule
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- _TOKENIZER_FOR_DOC,
- _CHECKPOINT_FOR_DOC,
- FlaxQuestionAnsweringModelOutput,
- _CONFIG_FOR_DOC,
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForCausalLMModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- gradient_checkpointing: bool = False
-
- def setup(self):
- self.{{cookiecutter.lowercase_modelname}} = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, add_pooling_layer=False, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing)
- self.cls = Flax{{cookiecutter.camelcase_modelname}}OnlyMLMHead(config=self.config, dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- position_ids,
- token_type_ids: Optional[jnp.ndarray] = None,
- head_mask: Optional[jnp.ndarray] = None,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # Model
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask,
- token_type_ids,
- position_ids,
- head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- init_cache=init_cache,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- hidden_states = outputs[0]
- if self.config.tie_word_embeddings:
- shared_embedding = self.{{cookiecutter.lowercase_modelname}}.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
- else:
- shared_embedding = None
-
- # Compute the prediction scores
- logits = self.cls(hidden_states, shared_embedding=shared_embedding)
-
- if not return_dict:
- return (logits,) + outputs[1:]
-
- return FlaxCausalLMOutputWithCrossAttentions(
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
- autoregressive tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForCausalLM(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForCausalLMModule
-
- def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
- # initializing the cache
- batch_size, seq_length = input_ids.shape
-
- past_key_values = self.init_cache(batch_size, max_length)
- # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
- # But since the decoder uses a causal mask, those positions are masked anyway.
- # Thus, we can create a single static attention_mask here, which is more efficient for compilation
- extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
- if attention_mask is not None:
- position_ids = attention_mask.cumsum(axis=-1) - 1
- extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
- else:
- position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
-
- return {
- "past_key_values": past_key_values,
- "attention_mask": extended_attention_mask,
- "position_ids": position_ids,
- }
-
- def update_inputs_for_generation(self, model_outputs, model_kwargs):
- model_kwargs["past_key_values"] = model_outputs.past_key_values
- model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
- return model_kwargs
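-
- # Decoding sketch (illustrative; `generate()` drives this loop internally):
- # kwargs = model.prepare_inputs_for_generation(input_ids, max_length, attention_mask)
- # outputs = model(current_ids, **kwargs) # one new position per step
- # kwargs = model.update_inputs_for_generation(outputs, kwargs)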
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
- _TOKENIZER_FOR_DOC,
- _CHECKPOINT_FOR_DOC,
- FlaxCausalLMOutputWithCrossAttentions,
- _CONFIG_FOR_DOC,
-)
-{# encoder_decoder #}
-{% else %}
-import math
-import random
-from functools import partial
-from typing import Callable, Optional, Tuple
-
-import flax.linen as nn
-import jax
-import jax.numpy as jnp
-from flax.core.frozen_dict import FrozenDict, unfreeze, freeze
-from flax.linen import combine_masks, make_causal_mask
-from flax.linen.attention import dot_product_attention_weights
-from flax.traverse_util import flatten_dict, unflatten_dict
-from jax import lax
-from jax.random import PRNGKey
-
-from ...utils import add_start_docstrings, logging, replace_return_docstrings
-from ...modeling_flax_outputs import (
- FlaxBaseModelOutput,
- FlaxBaseModelOutputWithPastAndCrossAttentions,
- FlaxCausalLMOutputWithCrossAttentions,
- FlaxSeq2SeqLMOutput,
- FlaxSeq2SeqModelOutput,
- FlaxSeq2SeqQuestionAnsweringModelOutput,
- FlaxSeq2SeqSequenceClassifierOutput,
-)
-from ...modeling_flax_utils import (
- ACT2FN,
- FlaxPreTrainedModel,
- append_call_sample_docstring,
- append_replace_return_docstrings,
- overwrite_call_docstring,
-)
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer"
-
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
- This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
- generic methods the library implements for all its models (such as downloading or saving, resizing the input
- embeddings, pruning heads etc.)
-
- This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
- Module and refer to the Flax documentation for all matters related to general usage and behavior.
-
- Finally, this model supports inherent JAX features such as:
-
- - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
-
- Parameters:
- config ([`~{{cookiecutter.camelcase_modelname}}Config`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
- model weights.
- dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
- The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
- GPUs) and `jax.numpy.bfloat16` (on TPUs).
-
- This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given `dtype`.
-
- **Note that this only specifies the dtype of the computation and does not influence the dtype of model
- parameters.**
-
- If you wish to change the dtype of the model parameters, see
- [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
-"""
-
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
- it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
- Indices of decoder input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are decoder input IDs?](../glossary#decoder-input-ids)
-
- For translation and summarization training, `decoder_input_ids` should be provided. If no
- `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
- the right for denoising pre-training following the paper.
- decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
- Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
- also be used by default.
-
- If you want to change padding behavior, you should modify it to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
- position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
- range `[0, config.max_position_embeddings - 1]`.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-{{cookiecutter.uppercase_modelname}}_ENCODE_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
- it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-{{cookiecutter.uppercase_modelname}}_DECODE_INPUTS_DOCSTRING = r"""
- Args:
- decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
- Indices of decoder input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are decoder input IDs?](../glossary#decoder-input-ids)
-
- For translation and summarization training, `decoder_input_ids` should be provided. If no
- `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
- the right for denoising pre-training following the paper.
- encoder_outputs (`tuple(tuple(jnp.ndarray))`):
- Tuple consisting of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
- `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
- hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
- the decoder.
- encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
- Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
- also be used by default.
-
- If you want to change padding behavior, you should modify it to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
- decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
- range `[0, config.max_position_embeddings - 1]`.
- past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
- Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
- """
- Shift input ids one token to the right.
- """
- shifted_input_ids = jnp.roll(input_ids, 1, axis=-1)
- shifted_input_ids = shifted_input_ids.at[(..., 0)].set(decoder_start_token_id)
- # replace possible -100 values in labels by `pad_token_id`
- shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
-
- return shifted_input_ids
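-
-# Worked example (illustrative): with pad_token_id=1 and decoder_start_token_id=2,
-# shift_tokens_right(jnp.array([[5, -100, -100]]), 1, 2) proceeds as
-# roll -> [[-100, 5, -100]]; set start token -> [[2, 5, -100]];
-# replace -100 with pad -> [[2, 5, 1]].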
-
-
-class Flax{{cookiecutter.camelcase_modelname}}Attention(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- embed_dim: int
- num_heads: int
- dropout: float = 0.0
- causal: bool = False
- bias: bool = True
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self) -> None:
- self.head_dim = self.embed_dim // self.num_heads
- if self.head_dim * self.num_heads != self.embed_dim:
- raise ValueError(
- f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
- f" and `num_heads`: {self.num_heads})."
- )
-
- dense = partial(
- nn.Dense,
- self.embed_dim,
- use_bias=self.bias,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
- self.out_proj = dense()
-
- self.dropout_layer = nn.Dropout(rate=self.dropout)
-
- if self.causal:
- self.causal_mask = make_causal_mask(
- jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
- )
-
- def _split_heads(self, hidden_states):
- return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
-
- def _merge_heads(self, hidden_states):
- return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
-
- @nn.compact
- def _concatenate_to_cache(self, key, value, query, attention_mask):
- """
- This function takes projected key, value states from a single input token and concatenates the states to cached
- states from previous steps. This function is slightly adapted from the official Flax repository:
- https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
- """
- # detect if we're initializing by absence of existing cache data.
- is_initialized = self.has_variable("cache", "cached_key")
- cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
- cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
- cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
-
- if is_initialized:
- *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
- # update key, value caches with our new 1d spatial slices
- cur_index = cache_index.value
- indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
- key = lax.dynamic_update_slice(cached_key.value, key, indices)
- value = lax.dynamic_update_slice(cached_value.value, value, indices)
- cached_key.value = key
- cached_value.value = value
- num_updated_cache_vectors = query.shape[1]
- cache_index.value = cache_index.value + num_updated_cache_vectors
- # causal mask for cached decoder self-attention: our single query position should only
- # attend to those key positions that have already been generated and cached, not the
- # remaining zero elements.
- pad_mask = jnp.broadcast_to(
- jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
- tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
- )
- attention_mask = combine_masks(pad_mask, attention_mask)
- return key, value, attention_mask
-
- def __call__(
- self,
- hidden_states: jnp.ndarray,
- key_value_states: Optional[jnp.ndarray] = None,
- attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- deterministic: bool = True,
- ) -> Tuple[jnp.ndarray]:
- """Input shape: Batch x Time x Channel"""
-
- # if key_value_states are provided this layer is used as a cross-attention layer
- # for the decoder
- is_cross_attention = key_value_states is not None
- batch_size = hidden_states.shape[0]
-
- # get query proj
- query_states = self.q_proj(hidden_states)
- # get key, value proj
- if is_cross_attention:
- # cross_attentions
- key_states = self.k_proj(key_value_states)
- value_states = self.v_proj(key_value_states)
- else:
- # self_attention
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = self._split_heads(query_states)
- key_states = self._split_heads(key_states)
- value_states = self._split_heads(value_states)
-
- # handle cache prepare causal attention mask
- if self.causal:
- query_length, key_length = query_states.shape[1], key_states.shape[1]
- if self.has_variable("cache", "cached_key"):
- mask_shift = self.variables["cache"]["cache_index"]
- max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
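- # slice the precomputed causal mask so the current query positions line up with the
- # key positions already written to the cache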
- causal_mask = lax.dynamic_slice(
- self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
- )
- else:
- causal_mask = self.causal_mask[:, :, :query_length, :key_length]
- causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
-
- # combine masks if needed
- if attention_mask is not None and self.causal:
- attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
- attention_mask = combine_masks(attention_mask, causal_mask)
- elif self.causal:
- attention_mask = causal_mask
- elif attention_mask is not None:
- attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
-
- # During fast autoregressive decoding, we feed one position at a time,
- # and cache the keys and values step by step.
- if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
- key_states, value_states, attention_mask = self._concatenate_to_cache(
- key_states, value_states, query_states, attention_mask
- )
-
- # Convert the boolean attention mask to an attention bias.
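- # Visible positions (mask > 0) get a bias of 0.0; masked positions get the most negative
- # finite value of the dtype (about -3.4e38 for float32), so softmax assigns them ~0 weight.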
- if attention_mask is not None:
- # attention mask in the form of attention bias
- attention_bias = lax.select(
- attention_mask > 0,
- jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
- jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
- )
- else:
- attention_bias = None
-
- dropout_rng = None
- if not deterministic and self.dropout > 0.0:
- dropout_rng = self.make_rng("dropout")
-
- attn_weights = dot_product_attention_weights(
- query_states,
- key_states,
- bias=attention_bias,
- dropout_rng=dropout_rng,
- dropout_rate=self.dropout,
- broadcast_dropout=True,
- deterministic=deterministic,
- dtype=self.dtype,
- precision=None,
- )
-
- attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
- attn_output = self._merge_heads(attn_output)
- attn_output = self.out_proj(attn_output)
-
- return attn_output, attn_weights
-
-
-class Flax{{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
-
- def setup(self) -> None:
- self.embed_dim = self.config.d_model
- self.self_attn = Flax{{cookiecutter.camelcase_modelname}}Attention(
- config=self.config,
- embed_dim=self.embed_dim,
- num_heads=self.config.encoder_attention_heads,
- dropout=self.config.attention_dropout,
- dtype=self.dtype
- )
- self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype)
- self.dropout_layer = nn.Dropout(rate=self.config.dropout)
- self.activation_fn = ACT2FN[self.config.activation_function]
- self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
- self.fc1 = nn.Dense(
- self.config.encoder_ffn_dim,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.init_std),
- )
- self.fc2 = nn.Dense(
- self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
- )
- self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
-
- def __call__(
- self,
- hidden_states: jnp.ndarray,
- attention_mask: jnp.ndarray,
- output_attentions: bool = True,
- deterministic: bool = True,
- ) -> Tuple[jnp.ndarray]:
- residual = hidden_states
- hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
-
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = self.fc2(hidden_states)
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (attn_weights,)
-
- return outputs
-
-
-class Flax{{cookiecutter.camelcase_modelname}}EncoderLayerCollection(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.layers = [
- Flax{{cookiecutter.camelcase_modelname}}EncoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.encoder_layers)
- ]
- self.layerdrop = self.config.encoder_layerdrop
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- deterministic: bool = True,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- all_attentions = () if output_attentions else None
- all_hidden_states = () if output_hidden_states else None
-
- for encoder_layer in self.layers:
- if output_hidden_states:
- all_hidden_states = all_hidden_states + (hidden_states,)
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
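- # e.g. with encoder_layerdrop=0.1, each layer is skipped with probability 0.1 during training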
- dropout_probability = random.uniform(0, 1)
- if not deterministic and (dropout_probability < self.layerdrop): # skip the layer
- layer_outputs = (None, None)
- else:
- layer_outputs = encoder_layer(
- hidden_states,
- attention_mask,
- output_attentions,
- deterministic,
- )
- hidden_states = layer_outputs[0]
- if output_attentions:
- all_attentions = all_attentions + (layer_outputs[1],)
-
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- outputs = (hidden_states, all_hidden_states, all_attentions)
-
- if not return_dict:
- return tuple(v for v in outputs if v is not None)
-
- return FlaxBaseModelOutput(
- last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
-
- def setup(self) -> None:
- self.embed_dim = self.config.d_model
- self.self_attn = Flax{{cookiecutter.camelcase_modelname}}Attention(
- config=self.config,
- embed_dim=self.embed_dim,
- num_heads=self.config.decoder_attention_heads,
- dropout=self.config.attention_dropout,
- causal=True,
- dtype=self.dtype,
- )
- self.dropout_layer = nn.Dropout(rate=self.config.dropout)
- self.activation_fn = ACT2FN[self.config.activation_function]
- self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
-
- self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype)
- self.encoder_attn = Flax{{cookiecutter.camelcase_modelname}}Attention(
- config=self.config,
- embed_dim=self.embed_dim,
- num_heads=self.config.decoder_attention_heads,
- dropout=self.config.attention_dropout,
- dtype=self.dtype,
- )
- self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype)
- self.fc1 = nn.Dense(
- self.config.decoder_ffn_dim,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.init_std),
- )
- self.fc2 = nn.Dense(
- self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
- )
- self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
-
- def __call__(
- self,
- hidden_states: jnp.ndarray,
- attention_mask: jnp.ndarray,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- output_attentions: bool = True,
- deterministic: bool = True,
- ) -> Tuple[jnp.ndarray]:
- residual = hidden_states
-
- # Self Attention
- hidden_states, self_attn_weights = self.self_attn(
- hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
- )
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- # Cross-Attention Block
- cross_attn_weights = None
- if encoder_hidden_states is not None:
- residual = hidden_states
-
- hidden_states, cross_attn_weights = self.encoder_attn(
- hidden_states=hidden_states,
- key_value_states=encoder_hidden_states,
- attention_mask=encoder_attention_mask,
- )
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = residual + hidden_states
- hidden_states = self.encoder_attn_layer_norm(hidden_states)
-
- # Fully Connected
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = self.fc2(hidden_states)
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (self_attn_weights, cross_attn_weights)
-
- return outputs
-
-
-class Flax{{cookiecutter.camelcase_modelname}}DecoderLayerCollection(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.layers = [
- Flax{{cookiecutter.camelcase_modelname}}DecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.decoder_layers)
- ]
- self.layerdrop = self.config.decoder_layerdrop
-
- def __call__(
- self,
- hidden_states,
- attention_mask,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- deterministic: bool = True,
- init_cache: bool = False,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- ):
- # decoder layers
- all_hidden_states = () if output_hidden_states else None
- all_self_attns = () if output_attentions else None
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-
- for decoder_layer in self.layers:
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
- dropout_probability = random.uniform(0, 1)
- if not deterministic and (dropout_probability < self.layerdrop):
- layer_outputs = (None, None, None)
- else:
- layer_outputs = decoder_layer(
- hidden_states,
- attention_mask=attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- init_cache=init_cache,
- output_attentions=output_attentions,
- deterministic=deterministic,
- )
-
- hidden_states = layer_outputs[0]
- if output_attentions:
- all_self_attns += (layer_outputs[1],)
-
- if encoder_hidden_states is not None:
- all_cross_attentions += (layer_outputs[2],)
-
- # add hidden states from the last decoder layer
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
-
- if not return_dict:
- return tuple(v for v in outputs if v is not None)
-
- return FlaxBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- hidden_states=all_hidden_states,
- attentions=all_self_attns,
- cross_attentions=all_cross_attentions,
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ClassificationHead(nn.Module):
- """Head for sentence-level classification tasks."""
-
- config: {{cookiecutter.camelcase_modelname}}Config
- inner_dim: int
- num_classes: int
- pooler_dropout: float
- dtype: jnp.dtype = jnp.float32
-
- def setup(self):
- self.dense = nn.Dense(
- self.inner_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
- )
- self.dropout = nn.Dropout(rate=self.pooler_dropout)
- self.out_proj = nn.Dense(
- self.num_classes,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- def __call__(self, hidden_states: jnp.ndarray, deterministic: bool):
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- hidden_states = self.dense(hidden_states)
- hidden_states = jnp.tanh(hidden_states)
- hidden_states = self.dropout(hidden_states, deterministic=deterministic)
- hidden_states = self.out_proj(hidden_states)
- return hidden_states
-
-
-class Flax{{cookiecutter.camelcase_modelname}}Encoder(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- embed_tokens: Optional[nn.Embed] = None
-
- def setup(self):
- self.dropout_layer = nn.Dropout(rate=self.config.dropout)
-
- embed_dim = self.config.d_model
- self.padding_idx = self.config.pad_token_id
- self.max_source_positions = self.config.max_position_embeddings
- self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0
-
- if self.embed_tokens is None:
- self.embed_tokens = nn.Embed(
- self.config.vocab_size,
- embed_dim,
- embedding_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- # {{cookiecutter.camelcase_modelname}} is set up so that if padding_idx is specified, the embedding ids are offset by 2
- # and num_embeddings is adjusted accordingly. Other models don't have this hack.
- self.offset = 2
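- # i.e. position ids are shifted by `self.offset` before the embedding lookup in `__call__`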
- self.embed_positions = nn.Embed(
- self.config.max_position_embeddings + self.offset,
- embed_dim,
- embedding_init=jax.nn.initializers.normal(self.config.init_std),
- )
- self.layers = Flax{{cookiecutter.camelcase_modelname}}EncoderLayerCollection(self.config, self.dtype)
- self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- position_ids,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- input_shape = input_ids.shape
- input_ids = input_ids.reshape(-1, input_shape[-1])
-
- inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
- embed_pos = self.embed_positions(position_ids + self.offset)
-
- hidden_states = inputs_embeds + embed_pos
- hidden_states = self.layernorm_embedding(hidden_states)
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
-
- outputs = self.layers(
- hidden_states,
- attention_mask,
- deterministic=deterministic,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- if not return_dict:
- return outputs
-
- return FlaxBaseModelOutput(
- last_hidden_state=outputs.last_hidden_state,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}Decoder(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- embed_tokens: Optional[nn.Embed] = None
-
- def setup(self):
- self.dropout_layer = nn.Dropout(rate=self.config.dropout)
-
- embed_dim = self.config.d_model
- self.padding_idx = self.config.pad_token_id
- self.max_target_positions = self.config.max_position_embeddings
- self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0
-
- if self.embed_tokens is None:
- self.embed_tokens = nn.Embed(
- self.config.vocab_size,
- embed_dim,
- embedding_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- # {{cookiecutter.camelcase_modelname}} is set up so that if padding_idx is specified, the embedding ids are offset by 2
- # and num_embeddings is adjusted accordingly. Other models don't have this hack.
- self.offset = 2
- self.embed_positions = nn.Embed(
- self.config.max_position_embeddings + self.offset,
- embed_dim,
- embedding_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- self.layers = Flax{{cookiecutter.camelcase_modelname}}DecoderLayerCollection(self.config, self.dtype)
- self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype)
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- position_ids,
- encoder_hidden_states: Optional[jnp.ndarray] = None,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- init_cache: bool = False,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- input_shape = input_ids.shape
- input_ids = input_ids.reshape(-1, input_shape[-1])
-
- inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
- # embed positions
- positions = self.embed_positions(position_ids + self.offset)
-
- hidden_states = inputs_embeds + positions
- hidden_states = self.layernorm_embedding(hidden_states)
-
- hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
-
- outputs = self.layers(
- hidden_states,
- attention_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- deterministic=deterministic,
- init_cache=init_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- if not return_dict:
- return outputs
-
- return FlaxBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=outputs.last_hidden_state,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}Module(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
-
- def setup(self):
- self.shared = nn.Embed(
- self.config.vocab_size,
- self.config.d_model,
- embedding_init=jax.nn.initializers.normal(self.config.init_std),
- )
-
- self.encoder = Flax{{cookiecutter.camelcase_modelname}}Encoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
- self.decoder = Flax{{cookiecutter.camelcase_modelname}}Decoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
-
- def _get_encoder_module(self):
- return self.encoder
-
- def _get_decoder_module(self):
- return self.decoder
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- decoder_input_ids,
- decoder_attention_mask,
- position_ids,
- decoder_position_ids,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- encoder_outputs = self.encoder(
- input_ids=input_ids,
- attention_mask=attention_mask,
- position_ids=position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- )
-
- decoder_outputs = self.decoder(
- input_ids=decoder_input_ids,
- attention_mask=decoder_attention_mask,
- position_ids=decoder_position_ids,
- encoder_hidden_states=encoder_outputs[0],
- encoder_attention_mask=attention_mask,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- )
-
- if not return_dict:
- return decoder_outputs + encoder_outputs
-
- return FlaxSeq2SeqModelOutput(
- last_hidden_state=decoder_outputs.last_hidden_state,
- decoder_hidden_states=decoder_outputs.hidden_states,
- decoder_attentions=decoder_outputs.attentions,
- cross_attentions=decoder_outputs.cross_attentions,
- encoder_last_hidden_state=encoder_outputs.last_hidden_state,
- encoder_hidden_states=encoder_outputs.hidden_states,
- encoder_attentions=encoder_outputs.attentions,
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel(FlaxPreTrainedModel):
- config_class = {{cookiecutter.camelcase_modelname}}Config
- base_model_prefix: str = "model"
- module_class: nn.Module = None
-
- def __init__(
- self,
- config: {{cookiecutter.camelcase_modelname}}Config,
- input_shape: Tuple[int] = (1, 1),
- seed: int = 0,
- dtype: jnp.dtype = jnp.float32,
- _do_init: bool = True,
- **kwargs
- ):
- module = self.module_class(config=config, dtype=dtype, **kwargs)
- super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
-
- def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
- # init input tensors
- input_ids = jnp.zeros(input_shape, dtype="i4")
- # make sure initialization pass will work for Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassificationModule
- input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id)
- attention_mask = jnp.ones_like(input_ids)
- decoder_input_ids = input_ids
- decoder_attention_mask = jnp.ones_like(input_ids)
-
- batch_size, sequence_length = input_ids.shape
- position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
- decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
-
- params_rng, dropout_rng = jax.random.split(rng)
- rngs = {"params": params_rng, "dropout": dropout_rng}
-
- random_params = self.module.init(
- rngs,
- input_ids,
- attention_mask,
- decoder_input_ids,
- decoder_attention_mask,
- position_ids,
- decoder_position_ids,
- )["params"]
-
- if params is not None:
- random_params = flatten_dict(unfreeze(random_params))
- params = flatten_dict(unfreeze(params))
- for missing_key in self._missing_keys:
- params[missing_key] = random_params[missing_key]
- self._missing_keys = set()
- return freeze(unflatten_dict(params))
- else:
- return random_params
-
- def init_cache(self, batch_size, max_length, encoder_outputs):
- r"""
- Args:
- batch_size (`int`):
- batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (`int`):
- maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
- cache.
- encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
- `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
- *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`
- is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
- cross-attention of the decoder.
- """
- # init input variables to retrieve cache
- decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
- decoder_attention_mask = jnp.ones_like(decoder_input_ids)
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
- )
-
- def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
- decoder_module = module._get_decoder_module()
- return decoder_module(
- decoder_input_ids,
- decoder_attention_mask,
- decoder_position_ids,
- **kwargs,
- )
-
- init_variables = self.module.init(
- jax.random.PRNGKey(0),
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- decoder_position_ids=decoder_position_ids,
- encoder_hidden_states=encoder_outputs[0],
- init_cache=True,
- method=_decoder_forward, # we only need to call the decoder to init the cache
- )
- return unfreeze(init_variables["cache"])
-
- @add_start_docstrings({{cookiecutter.uppercase_modelname}}_ENCODE_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class={{cookiecutter.camelcase_modelname}}Config)
- def encode(
- self,
- input_ids: jnp.ndarray,
- attention_mask: Optional[jnp.ndarray] = None,
- position_ids: Optional[jnp.ndarray] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- train: bool = False,
- params: dict = None,
- dropout_rng: PRNGKey = None,
- ):
- r"""
- Returns:
-
- Example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
- ```"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.return_dict
-
- if attention_mask is None:
- attention_mask = jnp.ones_like(input_ids)
- if position_ids is None:
- batch_size, sequence_length = input_ids.shape
- position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
-
- # Handle any PRNG if needed
- rngs = {}
- if dropout_rng is not None:
- rngs["dropout"] = dropout_rng
-
- def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
- encode_module = module._get_encoder_module()
- return encode_module(input_ids, attention_mask, position_ids, **kwargs)
-
- return self.module.apply(
- {"params": params or self.params},
- input_ids=jnp.array(input_ids, dtype="i4"),
- attention_mask=jnp.array(attention_mask, dtype="i4"),
- position_ids=jnp.array(position_ids, dtype="i4"),
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=not train,
- rngs=rngs,
- method=_encoder_forward,
- )
-
- @add_start_docstrings({{cookiecutter.uppercase_modelname}}_DECODE_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class={{cookiecutter.camelcase_modelname}}Config)
- def decode(
- self,
- decoder_input_ids,
- encoder_outputs,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- decoder_attention_mask: Optional[jnp.ndarray] = None,
- decoder_position_ids: Optional[jnp.ndarray] = None,
- past_key_values: dict = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- train: bool = False,
- params: dict = None,
- dropout_rng: PRNGKey = None,
- ):
- r"""
- Returns:
-
- Example:
-
- ```python
- >>> import jax.numpy as jnp
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
-
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
-
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- ```"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.return_dict
-
- encoder_hidden_states = encoder_outputs[0]
- if encoder_attention_mask is None:
- batch_size, sequence_length = encoder_hidden_states.shape[:2]
- encoder_attention_mask = jnp.ones((batch_size, sequence_length))
-
- batch_size, sequence_length = decoder_input_ids.shape
- if decoder_attention_mask is None:
- decoder_attention_mask = jnp.ones((batch_size, sequence_length))
-
- if decoder_position_ids is None:
- if past_key_values is not None:
- raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
-
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
- )
-
- # Handle any PRNG if needed
- rngs = {}
- if dropout_rng is not None:
- rngs["dropout"] = dropout_rng
-
- inputs = {"params": params or self.params}
-
- # If past_key_values are passed, the cache is already initialized and the private flag
- # init_cache has to be passed down to ensure the cache is used. The cache must be marked
- # as mutable so that it can be changed by the Flax{{cookiecutter.camelcase_modelname}}Attention module.
- if past_key_values:
- inputs["cache"] = past_key_values
- mutable = ["cache"]
- else:
- mutable = False
-
- def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
- decoder_module = module._get_decoder_module()
- return decoder_module(
- decoder_input_ids,
- decoder_attention_mask,
- decoder_position_ids,
- **kwargs,
- )
-
- outputs = self.module.apply(
- inputs,
- decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
- decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
- decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=not train,
- rngs=rngs,
- mutable=mutable,
- method=_decoder_forward,
- )
-
- # add updated cache to model output
- if past_key_values is not None and return_dict:
- outputs, past = outputs
- outputs["past_key_values"] = unfreeze(past["cache"])
- return outputs
- elif past_key_values is not None and not return_dict:
- outputs, past = outputs
- outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
-
- return outputs
-
- def __call__(
- self,
- input_ids: jnp.ndarray,
- attention_mask: Optional[jnp.ndarray] = None,
- decoder_input_ids: Optional[jnp.ndarray] = None,
- decoder_attention_mask: Optional[jnp.ndarray] = None,
- position_ids: Optional[jnp.ndarray] = None,
- decoder_position_ids: Optional[jnp.ndarray] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- train: bool = False,
- params: dict = None,
- dropout_rng: PRNGKey = None,
- ):
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.return_dict
-
- # prepare encoder inputs
- if attention_mask is None:
- attention_mask = jnp.ones_like(input_ids)
- if position_ids is None:
- batch_size, sequence_length = input_ids.shape
- position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
-
- # prepare decoder inputs
- if decoder_input_ids is None:
- decoder_input_ids = shift_tokens_right(
- input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
- )
- if decoder_attention_mask is None:
- decoder_attention_mask = jnp.ones_like(decoder_input_ids)
- if decoder_position_ids is None:
- batch_size, sequence_length = decoder_input_ids.shape
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
- )
-
- # Handle any PRNG if needed
- rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
-
- return self.module.apply(
- {"params": params or self.params},
- input_ids=jnp.array(input_ids, dtype="i4"),
- attention_mask=jnp.array(attention_mask, dtype="i4"),
- position_ids=jnp.array(position_ids, dtype="i4"),
- decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
- decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
- decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=not train,
- rngs=rngs,
- )
-
-
-@add_start_docstrings(
- "The bare {{cookiecutter.camelcase_modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}Model(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32 # the dtype of the computation
- module_class = Flax{{cookiecutter.camelcase_modelname}}Module
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}Model, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForConditionalGenerationModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
-
- def setup(self):
- self.model = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype)
- self.lm_head = nn.Dense(
- self.model.shared.num_embeddings,
- use_bias=False,
- dtype=self.dtype,
- kernel_init=jax.nn.initializers.normal(self.config.init_std),
- )
- self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings))
-
- def _get_encoder_module(self):
- return self.model.encoder
-
- def _get_decoder_module(self):
- return self.model.decoder
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- decoder_input_ids,
- decoder_attention_mask,
- position_ids,
- decoder_position_ids,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- outputs = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- position_ids=position_ids,
- decoder_position_ids=decoder_position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- )
-
- hidden_states = outputs[0]
-
- if self.config.tie_word_embeddings:
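- # weight tying: reuse the transposed token-embedding matrix as the LM head kernel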
- shared_embedding = self.model.variables["params"]["shared"]["embedding"]
- lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
- else:
- lm_logits = self.lm_head(hidden_states)
-
- lm_logits += self.final_logits_bias.astype(self.dtype)
-
- if not return_dict:
- output = (lm_logits,) + outputs[1:]
- return output
-
- return FlaxSeq2SeqLMOutput(
- logits=lm_logits,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
-
-@add_start_docstrings(
- "The {{cookiecutter.uppercase_modelname}} Model with a language modeling head. Can be used for summarization.", {{cookiecutter.uppercase_modelname}}_START_DOCSTRING
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGenerationModule
- dtype: jnp.dtype = jnp.float32
-
- @add_start_docstrings({{cookiecutter.uppercase_modelname}}_DECODE_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class={{cookiecutter.camelcase_modelname}}Config)
- def decode(
- self,
- decoder_input_ids,
- encoder_outputs,
- encoder_attention_mask: Optional[jnp.ndarray] = None,
- decoder_attention_mask: Optional[jnp.ndarray] = None,
- decoder_position_ids: Optional[jnp.ndarray] = None,
- past_key_values: dict = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- deterministic: bool = True,
- params: dict = None,
- dropout_rng: PRNGKey = None,
- ):
- r"""
- Returns:
-
- Example:
-
- ```python
- >>> import jax.numpy as jnp
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
-
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
-
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- ```"""
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.return_dict
-
- encoder_hidden_states = encoder_outputs[0]
- if encoder_attention_mask is None:
- batch_size, sequence_length = encoder_hidden_states.shape[:2]
- encoder_attention_mask = jnp.ones((batch_size, sequence_length))
-
- batch_size, sequence_length = decoder_input_ids.shape
- if decoder_attention_mask is None:
- decoder_attention_mask = jnp.ones((batch_size, sequence_length))
-
- if decoder_position_ids is None:
- if past_key_values is not None:
- raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
-
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
- )
-
- # Handle any PRNG if needed
- rngs = {}
- if dropout_rng is not None:
- rngs["dropout"] = dropout_rng
-
- inputs = {"params": params or self.params}
-
- # If past_key_values are passed, the cache is already initialized and the private flag
- # init_cache has to be passed down to ensure the cache is used. The cache must be marked
- # as mutable so that it can be changed by the Flax{{cookiecutter.camelcase_modelname}}Attention module.
- if past_key_values:
- inputs["cache"] = past_key_values
- mutable = ["cache"]
- else:
- mutable = False
-
- def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
- decoder_module = module._get_decoder_module()
- outputs = decoder_module(
- decoder_input_ids,
- decoder_attention_mask,
- decoder_position_ids,
- **kwargs,
- )
- hidden_states = outputs[0]
-
- if self.config.tie_word_embeddings:
- shared_embedding = module.model.variables["params"]["shared"]["embedding"]
- lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
- else:
- lm_logits = module.lm_head(hidden_states)
-
- lm_logits += module.final_logits_bias.astype(self.dtype)
- return lm_logits, outputs
-
- outputs = self.module.apply(
- inputs,
- decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
- decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
- decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- rngs=rngs,
- mutable=mutable,
- method=_decoder_forward,
- )
-
- if past_key_values is None:
- lm_logits, decoder_outputs = outputs
- else:
- (lm_logits, decoder_outputs), past = outputs
-
- if return_dict:
- outputs = FlaxCausalLMOutputWithCrossAttentions(
- logits=lm_logits,
- hidden_states=decoder_outputs.hidden_states,
- attentions=decoder_outputs.attentions,
- cross_attentions=decoder_outputs.cross_attentions,
- )
- else:
- outputs = (lm_logits,) + decoder_outputs[1:]
-
- # add updated cache to model output
- if past_key_values is not None and return_dict:
- outputs["past_key_values"] = unfreeze(past["cache"])
- return outputs
- elif past_key_values is not None and not return_dict:
- outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
-
- return outputs
-
- def prepare_inputs_for_generation(
- self,
- decoder_input_ids,
- max_length,
- attention_mask: Optional[jax.Array] = None,
- decoder_attention_mask: Optional[jax.Array] = None,
- encoder_outputs=None,
- **kwargs
- ):
- # initializing the cache
- batch_size, seq_length = decoder_input_ids.shape
-
- past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
- # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
- # But since the decoder uses a causal mask, those positions are masked anyway.
- # Thus we can create a single static attention_mask here, which is more efficient for compilation.
- extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
- if decoder_attention_mask is not None:
- position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
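- # e.g. an attention mask of [1, 1, 1, 0, 0] yields position ids [0, 1, 2, 2, 2]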
- extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
- else:
- position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
-
- return {
- "past_key_values": past_key_values,
- "encoder_outputs": encoder_outputs,
- "encoder_attention_mask": attention_mask,
- "decoder_attention_mask": extended_attention_mask,
- "decoder_position_ids": position_ids,
- }
-
- def update_inputs_for_generation(self, model_outputs, model_kwargs):
- model_kwargs["past_key_values"] = model_outputs.past_key_values
- model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
- return model_kwargs
-
-
-FLAX_{{cookiecutter.uppercase_modelname}}_CONDITIONAL_GENERATION_DOCSTRING = """
- Returns:
-
- Summarization example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='np')
-
- >>> # Generate Summary
- >>> summary_ids = model.generate(inputs['input_ids']).sequences
- >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
- ```
-
- Mask filling example:
-
- ```python
- >>> import jax
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> TXT = "My friends are but they eat too many carbs."
- >>> input_ids = tokenizer([TXT], return_tensors='np')['input_ids']
-
- >>> logits = model(input_ids).logits
- >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
- >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
- >>> values, predictions = jax.lax.top_k(probs, k=1)
-
- >>> tokenizer.decode(predictions).split()
- ```
-"""
-
-overwrite_call_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING + FLAX_{{cookiecutter.uppercase_modelname}}_CONDITIONAL_GENERATION_DOCSTRING
-)
-append_replace_return_docstrings(
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassificationModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- num_labels: Optional[int] = None
-
- def setup(self):
- self.model = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype)
- self.classification_head = Flax{{cookiecutter.camelcase_modelname}}ClassificationHead(
- config=self.config,
- inner_dim=self.config.d_model,
- num_classes=self.num_labels if self.num_labels is not None else self.config.num_labels,
- pooler_dropout=self.config.classifier_dropout,
- )
-
- def _get_encoder_module(self):
- return self.model.encoder
-
- def _get_decoder_module(self):
- return self.model.decoder
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- decoder_input_ids,
- decoder_attention_mask,
- position_ids,
- decoder_position_ids,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- outputs = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- position_ids=position_ids,
- decoder_position_ids=decoder_position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- )
-
- hidden_states = outputs[0] # last hidden state
-
- eos_mask = jnp.where(input_ids == self.config.eos_token_id, 1, 0)
-
- # The first condition is necessary to overcome jax._src.errors.ConcretizationTypeError during JIT compilation
- if not isinstance(eos_mask, jax.interpreters.partial_eval.DynamicJaxprTracer):
- if len(jnp.unique(eos_mask.sum(1))) > 1:
- raise ValueError("All examples must have the same number of <eos> tokens.")
-
- if any(eos_mask.sum(1) == 0):
- raise ValueError("There are missing <eos> tokens in input_ids.")
-
- # Ensure to keep 1 only for the last token for each example
- eos_mask_noised = eos_mask + jnp.arange(eos_mask.shape[1]) * 1e-6
- eos_mask = jnp.where(eos_mask_noised == eos_mask_noised.max(1).reshape(-1, 1), 1, 0)
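- # the tiny position-dependent noise makes the last <eos> in each row the unique maximum,
- # so the pooling below keeps exactly one hidden state per example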
-
- sentence_representation = jnp.einsum("ijk, ij -> ijk", hidden_states, eos_mask).sum(1)
- logits = self.classification_head(sentence_representation, deterministic=deterministic)
-
- if not return_dict:
- output = (logits,) + outputs[1:]
- return output
-
- return FlaxSeq2SeqSequenceClassifierOutput(
- logits=logits,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE
- tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassificationModule
- dtype = jnp.float32
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- _TOKENIZER_FOR_DOC,
- _CHECKPOINT_FOR_DOC,
- FlaxSeq2SeqSequenceClassifierOutput,
- _CONFIG_FOR_DOC,
-)
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnsweringModule(nn.Module):
- config: {{cookiecutter.camelcase_modelname}}Config
- dtype: jnp.dtype = jnp.float32
- num_labels = 2
-
- def setup(self):
- self.model = Flax{{cookiecutter.camelcase_modelname}}Module(config=self.config, dtype=self.dtype)
- self.qa_outputs = nn.Dense(
- self.num_labels, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
- )
-
- def _get_encoder_module(self):
- return self.model.encoder
-
- def _get_decoder_module(self):
- return self.model.decoder
-
- def __call__(
- self,
- input_ids,
- attention_mask,
- decoder_input_ids,
- decoder_attention_mask,
- position_ids,
- decoder_position_ids,
- output_attentions: bool = False,
- output_hidden_states: bool = False,
- return_dict: bool = True,
- deterministic: bool = True,
- ):
- outputs = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- position_ids=position_ids,
- decoder_position_ids=decoder_position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- deterministic=deterministic,
- )
-
- sequence_output = outputs[0]
-
- logits = self.qa_outputs(sequence_output)
- start_logits, end_logits = jnp.split(logits, logits.shape[-1], axis=-1)
- start_logits = start_logits.squeeze(-1)
- end_logits = end_logits.squeeze(-1)
-
- if not return_dict:
- output = (start_logits, end_logits) + outputs[1:]
- return output
-
- return FlaxSeq2SeqQuestionAnsweringModelOutput(
- start_logits=start_logits,
- end_logits=end_logits,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
-
-@add_start_docstrings(
- """
- {{cookiecutter.uppercase_modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
- layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- module_class = Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnsweringModule
- dtype = jnp.float32
-
-
-append_call_sample_docstring(
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- _TOKENIZER_FOR_DOC,
- _CHECKPOINT_FOR_DOC,
- FlaxSeq2SeqQuestionAnsweringModelOutput,
- _CONFIG_FOR_DOC,
-)
-
-{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index d903c18b2f06f3..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,2819 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 {{cookiecutter.modelname}} model. """
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-import math
-from typing import Dict, Optional, Tuple, Union
-
-import numpy as np
-import tensorflow as tf
-
-from ...activations_tf import get_tf_activation
-from ...utils import (
- DUMMY_INPUTS,
- MULTIPLE_CHOICE_DUMMY_INPUTS,
- add_code_sample_docstrings,
- add_start_docstrings,
- add_start_docstrings_to_model_forward,
-)
-from ...modeling_tf_outputs import (
- TFBaseModelOutputWithPastAndCrossAttentions,
- TFCausalLMOutputWithCrossAttentions,
- TFMaskedLMOutput,
- TFMultipleChoiceModelOutput,
- TFQuestionAnsweringModelOutput,
- TFSequenceClassifierOutput,
- TFTokenClassifierOutput,
-)
-from ...modeling_tf_utils import (
- TFCausalLanguageModelingLoss,
- TFMaskedLanguageModelingLoss,
- TFModelInputType,
- TFMultipleChoiceLoss,
- TFPreTrainedModel,
- TFQuestionAnsweringLoss,
- TFSequenceClassificationLoss,
- TFSequenceSummary,
- TFTokenClassificationLoss,
- get_initializer,
- keras,
- keras_serializable,
- unpack_inputs,
-)
-from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
-from ...utils import logging
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Embeddings(keras.layers.Layer):
- """Construct the embeddings from word, position and token_type embeddings."""
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.vocab_size = config.vocab_size
- self.type_vocab_size = config.type_vocab_size
- self.hidden_size = config.hidden_size
- self.max_position_embeddings = config.max_position_embeddings
- self.initializer_range = config.initializer_range
- self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
- self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
-
- def build(self, input_shape: tf.TensorShape):
- with tf.name_scope("word_embeddings"):
- self.weight = self.add_weight(
- name="weight",
- shape=[self.vocab_size, self.hidden_size],
- initializer=get_initializer(self.initializer_range),
- )
-
- with tf.name_scope("token_type_embeddings"):
- self.token_type_embeddings = self.add_weight(
- name="embeddings",
- shape=[self.type_vocab_size, self.hidden_size],
- initializer=get_initializer(self.initializer_range),
- )
-
- with tf.name_scope("position_embeddings"):
- self.position_embeddings = self.add_weight(
- name="embeddings",
- shape=[self.max_position_embeddings, self.hidden_size],
- initializer=get_initializer(self.initializer_range),
- )
-
- super().build(input_shape)
-
- def call(
- self,
-        input_ids: tf.Tensor | None = None,
-        position_ids: tf.Tensor | None = None,
-        token_type_ids: tf.Tensor | None = None,
-        inputs_embeds: tf.Tensor | None = None,
- past_key_values_length=0,
- training: bool = False,
- ) -> tf.Tensor:
- """
- Applies embedding based on inputs tensor.
-
- Returns:
- final_embeddings (`tf.Tensor`): output embedding tensor.
- """
-        if input_ids is None and inputs_embeds is None:
-            raise ValueError("You have to specify either `input_ids` or `inputs_embeds`.")
-
- if input_ids is not None:
- check_embeddings_within_bounds(input_ids, self.vocab_size)
- inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
-
- input_shape = shape_list(inputs_embeds)[:-1]
-
- if token_type_ids is None:
- token_type_ids = tf.fill(dims=input_shape, value=0)
-
- if position_ids is None:
- position_ids = tf.expand_dims(
- tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
- )
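-        # Illustrative note (assumed values): with past_key_values_length = 2 and a new chunk of length 3,
-        # position_ids defaults to [[2, 3, 4]], i.e. positions continue right after the cached prefix.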
-
- position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
- token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
- final_embeddings = inputs_embeds + position_embeds + token_type_embeds
- final_embeddings = self.LayerNorm(inputs=final_embeddings)
- final_embeddings = self.dropout(inputs=final_embeddings, training=training)
-
- return final_embeddings
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}SelfAttention(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- if config.hidden_size % config.num_attention_heads != 0:
- raise ValueError(
- f"The hidden size ({config.hidden_size}) is not a multiple of the number "
- f"of attention heads ({config.num_attention_heads})"
- )
-
- self.num_attention_heads = config.num_attention_heads
- self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
- self.all_head_size = self.num_attention_heads * self.attention_head_size
- self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
-
- self.query = keras.layers.Dense(
- units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
- )
- self.key = keras.layers.Dense(
- units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
- )
- self.value = keras.layers.Dense(
- units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
- )
- self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
-
- self.is_decoder = config.is_decoder
-
- def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
- # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
- tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
- # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
- return tf.transpose(tensor, perm=[0, 2, 1, 3])
-
- def call(
- self,
- hidden_states: tf.Tensor,
- attention_mask: tf.Tensor,
- head_mask: tf.Tensor,
- encoder_hidden_states: tf.Tensor,
- encoder_attention_mask: tf.Tensor,
- past_key_value: Tuple[tf.Tensor],
- output_attentions: bool,
- training: bool = False,
- ) -> Tuple[tf.Tensor]:
- batch_size = shape_list(hidden_states)[0]
- mixed_query_layer = self.query(inputs=hidden_states)
-
- # If this is instantiated as a cross-attention module, the keys
- # and values come from an encoder; the attention mask needs to be
- # such that the encoder's padding tokens are not attended to.
- is_cross_attention = encoder_hidden_states is not None
-
- if is_cross_attention and past_key_value is not None:
- # reuse k,v, cross_attentions
- key_layer = past_key_value[0]
- value_layer = past_key_value[1]
- attention_mask = encoder_attention_mask
- elif is_cross_attention:
- key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
- value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
- attention_mask = encoder_attention_mask
- elif past_key_value is not None:
- key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
- value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
- key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
- value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
- else:
- key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
- value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
-
- query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-
- if self.is_decoder:
- # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
- # Further calls to cross_attention layer can then reuse all cross-attention
- # key/value_states (first "if" case)
- # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
- # if encoder bi-directional self-attention `past_key_value` is always `None`
- past_key_value = (key_layer, value_layer)
-
- # Take the dot product between "query" and "key" to get the raw attention scores.
- # (batch size, num_heads, seq_len_q, seq_len_k)
- attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
- dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
- attention_scores = tf.divide(attention_scores, dk)
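-        # Illustrative note (assumed config): with attention_head_size = 64, the raw scores are divided by
-        # 8.0 (= sqrt(64)), the standard scaled dot-product attention normalization.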
-
- if attention_mask is not None:
-            # Apply the attention mask (precomputed for all layers in the TF{{cookiecutter.camelcase_modelname}}Model call() function)
- attention_scores = tf.add(attention_scores, attention_mask)
-
- # Normalize the attention scores to probabilities.
- attention_probs = stable_softmax(logits=attention_scores, axis=-1)
-
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
- attention_probs = self.dropout(inputs=attention_probs, training=training)
-
- # Mask heads if we want to
- if head_mask is not None:
- attention_probs = tf.multiply(attention_probs, head_mask)
-
- attention_output = tf.matmul(attention_probs, value_layer)
- attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
-
- # (batch_size, seq_len_q, all_head_size)
- attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
- outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
-
- if self.is_decoder:
- outputs = outputs + (past_key_value,)
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}SelfOutput(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.dense = keras.layers.Dense(
- units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
- )
- self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
- self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
-
- def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
- hidden_states = self.dense(inputs=hidden_states)
- hidden_states = self.dropout(inputs=hidden_states, training=training)
- hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
-
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.self_attention = TF{{cookiecutter.camelcase_modelname}}SelfAttention(config, name="self")
- self.dense_output = TF{{cookiecutter.camelcase_modelname}}SelfOutput(config, name="output")
-
- def prune_heads(self, heads):
- raise NotImplementedError
-
- def call(
- self,
- input_tensor: tf.Tensor,
- attention_mask: tf.Tensor,
- head_mask: tf.Tensor,
- encoder_hidden_states: tf.Tensor,
- encoder_attention_mask: tf.Tensor,
- past_key_value: Tuple[tf.Tensor],
- output_attentions: bool,
- training: bool = False,
- ) -> Tuple[tf.Tensor]:
- self_outputs = self.self_attention(
- hidden_states=input_tensor,
- attention_mask=attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_value=past_key_value,
- output_attentions=output_attentions,
- training=training,
- )
- attention_output = self.dense_output(
- hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
- )
- # add attentions (possibly with past_key_value) if we output them
- outputs = (attention_output,) + self_outputs[1:]
-
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Intermediate(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.dense = keras.layers.Dense(
- units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
- )
-
- if isinstance(config.hidden_act, str):
- self.intermediate_act_fn = get_tf_activation(config.hidden_act)
- else:
- self.intermediate_act_fn = config.hidden_act
-
- def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
- hidden_states = self.dense(inputs=hidden_states)
- hidden_states = self.intermediate_act_fn(hidden_states)
-
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Output(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.dense = keras.layers.Dense(
- units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
- )
- self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
- self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
-
- def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
- hidden_states = self.dense(inputs=hidden_states)
- hidden_states = self.dropout(inputs=hidden_states, training=training)
- hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
-
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Layer(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.attention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="attention")
- self.is_decoder = config.is_decoder
- self.add_cross_attention = config.add_cross_attention
- if self.add_cross_attention:
- if not self.is_decoder:
- raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
- self.crossattention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="crossattention")
- self.intermediate = TF{{cookiecutter.camelcase_modelname}}Intermediate(config, name="intermediate")
- self.bert_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output")
-
- def call(
- self,
- hidden_states: tf.Tensor,
- attention_mask: tf.Tensor,
- head_mask: tf.Tensor,
- encoder_hidden_states: tf.Tensor | None,
- encoder_attention_mask: tf.Tensor | None,
- past_key_value: Tuple[tf.Tensor] | None,
- output_attentions: bool,
- training: bool = False,
- ) -> Tuple[tf.Tensor]:
- # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
- self_attention_outputs = self.attention(
- input_tensor=hidden_states,
- attention_mask=attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_value=self_attn_past_key_value,
- output_attentions=output_attentions,
- training=training,
- )
- attention_output = self_attention_outputs[0]
-
- # if decoder, the last output is tuple of self-attn cache
- if self.is_decoder:
- outputs = self_attention_outputs[1:-1]
- present_key_value = self_attention_outputs[-1]
- else:
- outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
-
- cross_attn_present_key_value = None
- if self.is_decoder and encoder_hidden_states is not None:
- if not hasattr(self, "crossattention"):
- raise ValueError(
- f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers "
- "by setting `config.add_cross_attention=True`"
- )
-
- # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
- cross_attention_outputs = self.crossattention(
- input_tensor=attention_output,
- attention_mask=attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_value=cross_attn_past_key_value,
- output_attentions=output_attentions,
- training=training,
- )
- attention_output = cross_attention_outputs[0]
- outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
-
- # add cross-attn cache to positions 3,4 of present_key_value tuple
- cross_attn_present_key_value = cross_attention_outputs[-1]
- present_key_value = present_key_value + cross_attn_present_key_value
-
- intermediate_output = self.intermediate(hidden_states=attention_output)
- layer_output = self.bert_output(
- hidden_states=intermediate_output, input_tensor=attention_output, training=training
- )
- outputs = (layer_output,) + outputs # add attentions if we output them
-
- # if decoder, return the attn key/values as the last output
- if self.is_decoder:
- outputs = outputs + (present_key_value,)
-
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
- self.config = config
- self.layer = [TF{{cookiecutter.camelcase_modelname}}Layer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
-
- def call(
- self,
- hidden_states: tf.Tensor,
- attention_mask: tf.Tensor,
- head_mask: tf.Tensor,
- encoder_hidden_states: tf.Tensor | None,
- encoder_attention_mask: tf.Tensor | None,
- past_key_values: Tuple[Tuple[tf.Tensor]] | None,
- use_cache: Optional[bool],
- output_attentions: bool,
- output_hidden_states: bool,
- return_dict: bool,
- training: bool = False,
- ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
- all_hidden_states = () if output_hidden_states else None
- all_attentions = () if output_attentions else None
- all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-
- next_decoder_cache = () if use_cache else None
- for i, layer_module in enumerate(self.layer):
- if output_hidden_states:
- all_hidden_states = all_hidden_states + (hidden_states,)
-
- past_key_value = past_key_values[i] if past_key_values is not None else None
-
- layer_outputs = layer_module(
- hidden_states=hidden_states,
- attention_mask=attention_mask,
- head_mask=head_mask[i],
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_value=past_key_value,
- output_attentions=output_attentions,
- training=training,
- )
- hidden_states = layer_outputs[0]
-
- if use_cache:
- next_decoder_cache += (layer_outputs[-1],)
-
- if output_attentions:
- all_attentions = all_attentions + (layer_outputs[1],)
- if self.config.add_cross_attention and encoder_hidden_states is not None:
- all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
-
- # Add last layer
- if output_hidden_states:
- all_hidden_states = all_hidden_states + (hidden_states,)
-
- if not return_dict:
- return tuple(
- v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
- )
-
- return TFBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- past_key_values=next_decoder_cache,
- hidden_states=all_hidden_states,
- attentions=all_attentions,
- cross_attentions=all_cross_attentions,
- )
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.dense = keras.layers.Dense(
- units=config.hidden_size,
- kernel_initializer=get_initializer(config.initializer_range),
- name="dense",
- )
-
- if isinstance(config.hidden_act, str):
- self.transform_act_fn = get_tf_activation(config.hidden_act)
- else:
- self.transform_act_fn = config.hidden_act
-
- self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-
- def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
- hidden_states = self.dense(inputs=hidden_states)
- hidden_states = self.transform_act_fn(hidden_states)
- hidden_states = self.LayerNorm(inputs=hidden_states)
-
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs):
- super().__init__(**kwargs)
-
- self.vocab_size = config.vocab_size
- self.hidden_size = config.hidden_size
-
- self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform")
-
- # The output weights are the same as the input embeddings, but there is
- # an output-only bias for each token.
- self.input_embeddings = input_embeddings
-
- def build(self, input_shape: tf.TensorShape):
- self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-
- super().build(input_shape)
-
- def get_output_embeddings(self) -> keras.layers.Layer:
- return self.input_embeddings
-
- def set_output_embeddings(self, value: tf.Variable):
- self.input_embeddings.weight = value
- self.input_embeddings.vocab_size = shape_list(value)[0]
-
- def get_bias(self) -> Dict[str, tf.Variable]:
- return {"bias": self.bias}
-
- def set_bias(self, value: tf.Variable):
- self.bias = value["bias"]
- self.vocab_size = shape_list(value["bias"])[0]
-
- def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
- hidden_states = self.transform(hidden_states=hidden_states)
- seq_length = shape_list(hidden_states)[1]
- hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
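-        # The matmul below projects onto the vocabulary with the transposed (tied) input embedding matrix:
-        # (batch_size * seq_length, hidden_size) x (hidden_size, vocab_size) -> (batch_size * seq_length, vocab_size)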
- hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
- hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
- hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
-
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
-class TF{{cookiecutter.camelcase_modelname}}MLMHead(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs):
- super().__init__(**kwargs)
-
- self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions")
-
- def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
- prediction_scores = self.predictions(hidden_states=sequence_output)
-
- return prediction_scores
-
-
-@keras_serializable
-class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer):
- config_class = {{cookiecutter.camelcase_modelname}}Config
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, **kwargs):
- super().__init__(**kwargs)
-
- self.config = config
- self.is_decoder = config.is_decoder
-
- self.embeddings = TF{{cookiecutter.camelcase_modelname}}Embeddings(config, name="embeddings")
- self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder")
-
- # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
- def get_input_embeddings(self) -> keras.layers.Layer:
- return self.embeddings
-
- # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
- def set_input_embeddings(self, value: tf.Variable):
- self.embeddings.weight = value
- self.embeddings.vocab_size = shape_list(value)[0]
-
- # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
- def _prune_heads(self, heads_to_prune):
- """
-        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
-        base class PreTrainedModel.
- """
- raise NotImplementedError
-
- @unpack_inputs
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
- encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
- past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- training: bool = False,
- ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
-
- if not self.config.is_decoder:
- use_cache = False
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
- elif input_ids is not None:
- input_shape = shape_list(input_ids)
- elif inputs_embeds is not None:
- input_shape = shape_list(inputs_embeds)[:-1]
- else:
- raise ValueError("You have to specify either input_ids or inputs_embeds")
-
- batch_size, seq_length = input_shape
-
- if past_key_values is None:
- past_key_values_length = 0
- past_key_values = [None] * len(self.encoder.layer)
- else:
- past_key_values_length = shape_list(past_key_values[0][0])[-2]
-
- if attention_mask is None:
- attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
-
- if token_type_ids is None:
- token_type_ids = tf.fill(dims=input_shape, value=0)
-
- embedding_output = self.embeddings(
- input_ids=input_ids,
- position_ids=position_ids,
- token_type_ids=token_type_ids,
- inputs_embeds=inputs_embeds,
- past_key_values_length=past_key_values_length,
- training=training,
- )
-
- # We create a 3D attention mask from a 2D tensor mask.
- # Sizes are [batch_size, 1, 1, to_seq_length]
- # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is simpler than the triangular masking of causal attention
-        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
- attention_mask_shape = shape_list(attention_mask)
-
- mask_seq_length = seq_length + past_key_values_length
- # Copied from `modeling_tf_t5.py`
- # Provided a padding mask of dimensions [batch_size, mask_seq_length]
- # - if the model is a decoder, apply a causal mask in addition to the padding mask
- # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
- if self.is_decoder:
- seq_ids = tf.range(mask_seq_length)
- causal_mask = tf.less_equal(
- tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
- seq_ids[None, :, None],
- )
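-            # Illustrative note (assumed values): for mask_seq_length = 3, each batch entry of `causal_mask`
-            # is the lower-triangular matrix [[1, 0, 0], [1, 1, 0], [1, 1, 1]], i.e. position i may only
-            # attend to positions j <= i.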
- causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
- extended_attention_mask = causal_mask * attention_mask[:, None, :]
- attention_mask_shape = shape_list(extended_attention_mask)
- extended_attention_mask = tf.reshape(
- extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
- )
- if past_key_values[0] is not None:
- # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length]
- extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
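-                # Illustrative note (assumed values): with 2 cached tokens and seq_length = 1, only the last
-                # row of the causal mask is kept, giving shape (batch_size, 1, 1, 3).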
- else:
- extended_attention_mask = tf.reshape(
- attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
- )
-
- # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
- # masked positions, this operation will create a tensor which is 0.0 for
- # positions we want to attend and -10000.0 for masked positions.
- # Since we are adding it to the raw scores before the softmax, this is
- # effectively the same as removing these entirely.
- extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
- one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
- ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
- extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
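-        # Illustrative note (assumed values): a padding mask row [1, 1, 0] becomes [0.0, 0.0, -10000.0],
-        # which effectively removes the padded position from the softmax.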
-
- # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
- if self.is_decoder and encoder_attention_mask is not None:
-            # If a 2D or 3D attention mask is provided for the cross-attention,
-            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
- encoder_attention_mask = tf.cast(
- encoder_attention_mask, dtype=extended_attention_mask.dtype
- )
- num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
- if num_dims_encoder_attention_mask == 3:
- encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
- if num_dims_encoder_attention_mask == 2:
- encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
-
- # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
- # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
- # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
- # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
-
- encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
- else:
- encoder_extended_attention_mask = None
-
- # Prepare head mask if needed
- # 1.0 in head_mask indicate we keep the head
- # attention_probs has shape bsz x n_heads x N x N
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
- if head_mask is not None:
- raise NotImplementedError
- else:
- head_mask = [None] * self.config.num_hidden_layers
-
- encoder_outputs = self.encoder(
- hidden_states=embedding_output,
- attention_mask=extended_attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_extended_attention_mask,
- past_key_values=past_key_values,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
-
- sequence_output = encoder_outputs[0]
-
- if not return_dict:
-            return (sequence_output,) + encoder_outputs[1:]
-
- return TFBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=sequence_output,
- past_key_values=encoder_outputs.past_key_values,
- hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions,
- cross_attentions=encoder_outputs.cross_attentions,
- )
-
-
-class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
- """An abstract class to handle weights initialization and
- a simple interface for downloading and loading pretrained models.
- """
-
- config_class = {{cookiecutter.camelcase_modelname}}Config
- base_model_prefix = "{{cookiecutter.lowercase_modelname}}"
-
-
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
-
- This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
-    generic methods the library implements for all its models (such as downloading or saving, resizing the input
- embeddings, pruning heads etc.)
-
- This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass.
-    Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general
-    usage and behavior.
-
-
-
- TensorFlow models and layers in `transformers` accept two formats as input:
-
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional argument.
-
-    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
-    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
-    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
-    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
-    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
-    positional argument:
-
- - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
-
-    Note that when creating models and layers with [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/)
- then you don't need to worry about any of this, as you can just pass inputs like you would to any other Python
- function!
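-
-    A minimal sketch of the three formats, assuming `model` is an instantiated
-    `TF{{cookiecutter.camelcase_modelname}}Model` and the tensors are dummy inputs:
-
-    ```python
-    import tensorflow as tf
-
-    input_ids = tf.constant([[1, 2, 3]])
-    attention_mask = tf.constant([[1, 1, 1]])
-
-    outputs = model(input_ids)  # a single tensor
-    outputs = model([input_ids, attention_mask])  # a list, in docstring order
-    outputs = model({"input_ids": input_ids, "attention_mask": attention_mask})  # a dict keyed by input name
-    ```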
-
-
-
- Args:
- config ([`~{{cookiecutter.camelcase_modelname}}Config`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the configuration.
- Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
- Indices of input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`AutoTokenizer`]. See
- [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
-
- - 0 corresponds to a *sentence A* token,
- - 1 corresponds to a *sentence B* token.
-
- [What are token type IDs?](../glossary#token-type-ids)
- position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
-
- [What are position IDs?](../glossary#position-ids)
- head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated
- vectors than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
- config will be used instead.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
- used instead.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This
- argument can be used in eager mode, in graph mode the value will always be set to True.
- training (`bool`, *optional*, defaults to `False`):
- Whether or not to use the model in training mode (some modules like dropout modules have different
- behaviors between training and evaluation).
-"""
-
-
-@add_start_docstrings(
-    "The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFBaseModelOutputWithPastAndCrossAttentions,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
- encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
- past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- training: Optional[bool] = False,
- ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
- r"""
- encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
- the model is configured as a decoder.
- encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
-        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- use_cache (`bool`, *optional*, defaults to `True`):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
-            decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
- """
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
-
- return outputs
-
-
-@add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
-class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMaskedLanguageModelingLoss):
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- if config.is_decoder:
- logger.warning(
- "If you want to use `TF{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for "
- "bi-directional self-attention."
- )
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
-
- def get_lm_head(self) -> keras.layers.Layer:
- return self.mlm.predictions
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFMaskedLMOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- labels: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
- r"""
- labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
- """
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- sequence_output = outputs[0]
- prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
-        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
-
- if not return_dict:
- output = (prediction_scores,) + outputs[2:]
- return ((loss,) + output) if loss is not None else output
-
- return TFMaskedLMOutput(
- loss=loss,
- logits=prediction_scores,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a `language modeling` head on top for CLM fine-tuning. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING
-)
-class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFCausalLanguageModelingLoss):
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- if not config.is_decoder:
-            logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True`.")
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
-
- def get_lm_head(self) -> keras.layers.Layer:
- return self.mlm.predictions
-
- def prepare_inputs_for_generation(self, inputs, past_key_values=None, attention_mask=None, **model_kwargs):
- # cut decoder_input_ids if past is used
- if past_key_values:
- inputs = tf.expand_dims(inputs[:, -1], -1)
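-            # Illustrative note (assumed values): inputs [[t0, t1, t2]] become [[t2]] of shape (batch_size, 1),
-            # since the cached key/value states already cover t0 and t1.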
-
- return {
- "input_ids": inputs,
- "attention_mask": attention_mask,
- "past_key_values": past_key_values,
- "use_cache": model_kwargs["use_cache"],
- }
-
- @unpack_inputs
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFCausalLMOutputWithCrossAttentions,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
- encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
- past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- labels: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
- r"""
- encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
- the model is configured as a decoder.
- encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
-        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- use_cache (`bool`, *optional*, defaults to `True`):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
-            decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
- labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
- """
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- sequence_output = outputs[0]
- logits = self.mlm(sequence_output=sequence_output, training=training)
- loss = None
-
- if labels is not None:
- # shift labels to the left and cut last logit token
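-            # e.g. for a sequence [t0, t1, t2], the logits at positions [t0, t1] are scored against labels [t1, t2]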
- shifted_logits = logits[:, :-1]
- labels = labels[:, 1:]
- loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)
-
- if not return_dict:
- output = (logits,) + outputs[2:]
- return ((loss,) + output) if loss is not None else output
-
- return TFCausalLMOutputWithCrossAttentions(
- loss=loss,
- logits=logits,
- past_key_values=outputs.past_key_values,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
-
-class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(keras.layers.Layer):
- """Head for sentence-level classification tasks."""
-
-    def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
-        super().__init__(**kwargs)
-
- self.dense = keras.layers.Dense(
- units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
- )
- self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
- self.out_proj = keras.layers.Dense(
- units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
- )
-
- if isinstance(config.hidden_act, str):
- self.classifier_act_fn = get_tf_activation(config.hidden_act)
- else:
- self.classifier_act_fn = config.hidden_act
-
- def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
-        hidden_states = hidden_states[:, 0, :]  # take the first token's hidden state (equivalent to [CLS])
- hidden_states = self.dropout(inputs=hidden_states, training=training)
- hidden_states = self.dense(inputs=hidden_states)
- hidden_states = self.classifier_act_fn(hidden_states)
- hidden_states = self.dropout(inputs=hidden_states, training=training)
- hidden_states = self.out_proj(hidden_states)
-
- return hidden_states
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top
- e.g., for GLUE tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFSequenceClassificationLoss):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.num_labels = config.num_labels
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.classifier = TF{{cookiecutter.camelcase_modelname}}ClassificationHead(config, name="classifier")
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFSequenceClassifierOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- labels: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
- r"""
- labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss);
-            if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
- """
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- logits = self.classifier(hidden_states=outputs[0], training=training)
- loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
-
- if not return_dict:
- output = (logits,) + outputs[1:]
-
- return ((loss,) + output) if loss is not None else output
-
- return TFSequenceClassifierOutput(
- loss=loss,
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of
- the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMultipleChoiceLoss):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.sequence_summary = TFSequenceSummary(
- config, config.initializer_range, name="sequence_summary"
- )
- self.classifier = keras.layers.Dense(
- units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
- )
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFMultipleChoiceModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- labels: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
- r"""
- labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
-            where `num_choices` is the size of the second dimension of the input tensors (see `input_ids` above).
- """
-
- if input_ids is not None:
- num_choices = shape_list(input_ids)[1]
- seq_length = shape_list(input_ids)[2]
- else:
- num_choices = shape_list(inputs_embeds)[1]
- seq_length = shape_list(inputs_embeds)[2]
-
-        flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
-        flat_attention_mask = (
-            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
-        )
-        flat_token_type_ids = (
-            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
-        )
-        flat_position_ids = (
-            tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
-        )
-        flat_inputs_embeds = (
-            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
-            if inputs_embeds is not None
-            else None
-        )
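-        # Illustrative note (assumed shapes): input_ids of shape (batch_size, num_choices, seq_length) = (2, 4, 16)
-        # are flattened to (8, 16); each choice is encoded independently and the logits are reshaped back to (2, 4).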
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=flat_input_ids,
- attention_mask=flat_attention_mask,
- token_type_ids=flat_token_type_ids,
- position_ids=flat_position_ids,
- head_mask=head_mask,
- inputs_embeds=flat_inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- logits = self.sequence_summary(inputs=outputs[0], training=training)
- logits = self.classifier(inputs=logits)
- reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
- loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
-
- if not return_dict:
- output = (reshaped_logits,) + outputs[1:]
-
- return ((loss,) + output) if loss is not None else output
-
- return TFMultipleChoiceModelOutput(
- loss=loss,
- logits=reshaped_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of
- the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFTokenClassificationLoss):
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.num_labels = config.num_labels
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
- self.classifier = keras.layers.Dense(
- units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
- )
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFTokenClassifierOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- labels: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
- r"""
- labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
- """
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- sequence_output = outputs[0]
- sequence_output = self.dropout(inputs=sequence_output, training=training)
- logits = self.classifier(inputs=sequence_output)
- loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
-
- if not return_dict:
- output = (logits,) + outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return TFTokenClassifierOutput(
- loss=loss,
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
- layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFQuestionAnsweringLoss):
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.num_labels = config.num_labels
-
- self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
- self.qa_outputs = keras.layers.Dense(
- units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
- )
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFQuestionAnsweringModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids: TFModelInputType | None = None,
- attention_mask: np.ndarray | tf.Tensor | None = None,
- token_type_ids: np.ndarray | tf.Tensor | None = None,
- position_ids: np.ndarray | tf.Tensor | None = None,
- head_mask: np.ndarray | tf.Tensor | None = None,
- inputs_embeds: np.ndarray | tf.Tensor | None = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- start_positions: np.ndarray | tf.Tensor | None = None,
- end_positions: np.ndarray | tf.Tensor | None = None,
- training: Optional[bool] = False,
- ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
- r"""
- start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
- end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
- """
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids=input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- sequence_output = outputs[0]
- logits = self.qa_outputs(inputs=sequence_output)
- start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
- start_logits = tf.squeeze(input=start_logits, axis=-1)
- end_logits = tf.squeeze(input=end_logits, axis=-1)
- loss = None
-
- if start_positions is not None and end_positions is not None:
- labels = {"start_position": start_positions}
- labels["end_position"] = end_positions
- loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
-
- if not return_dict:
- output = (start_logits, end_logits) + outputs[2:]
- return ((loss,) + output) if loss is not None else output
-
- return TFQuestionAnsweringModelOutput(
- loss=loss,
- start_logits=start_logits,
- end_logits=end_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-{% else %}
-import random
-from typing import Optional, Tuple, Union
-
-import tensorflow as tf
-
-from ...activations_tf import get_tf_activation
-from ...utils import (
- add_code_sample_docstrings,
- add_start_docstrings,
- add_start_docstrings_to_model_forward,
- replace_return_docstrings,
-)
-from ...modeling_tf_outputs import (
- TFBaseModelOutput,
- TFBaseModelOutputWithPastAndCrossAttentions,
- TFSeq2SeqLMOutput,
- TFSeq2SeqModelOutput,
-)
-
-# Public API
-from ...modeling_tf_utils import (
- DUMMY_INPUTS,
- TFPreTrainedModel,
- keras_serializable,
- unpack_inputs,
-)
-from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
-from ...utils import ContextManagers, logging
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer"
-
-
-LARGE_NEGATIVE = -1e8
-
-
-# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right
-def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
- pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
- decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
- start_tokens = tf.fill((shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype))
- shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
- # replace possible -100 values in labels by `pad_token_id`
- shifted_input_ids = tf.where(
- shifted_input_ids == -100,
- tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
- shifted_input_ids,
- )
-
-    # Verify that `labels` has only positive values and -100
- assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype))
-
- # Make sure the assertion op is called by wrapping the result in an identity no-op
- with tf.control_dependencies([assert_gte0]):
- shifted_input_ids = tf.identity(shifted_input_ids)
-
- return shifted_input_ids
-
-
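-# Illustration of `shift_tokens_right` (hypothetical values, not from the original
-# template): with `pad_token_id=1` and `decoder_start_token_id=2`, labels
-# `[[5, 6, -100]]` become `[[2, 5, 6]]`: the start token is prepended, everything
-# shifts one position to the right, the last token drops off, and any `-100` still
-# present after the shift is replaced with the pad token.
-
-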
-def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
- """
-    Make causal mask used for uni-directional self-attention.
- """
- bsz, tgt_len = input_ids_shape
- mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
- mask_cond = tf.range(shape_list(mask)[-1])
-
- mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
-
- if past_key_values_length > 0:
- mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
-
- return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
-
-
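-# Sketch of the mask produced above (hypothetical values): for `tgt_len=3` with no
-# cache, position i may only attend to positions <= i, so each row keeps 0.0 up to the
-# diagonal and LARGE_NEGATIVE above it:
-#
-#     [[0, -1e8, -1e8],
-#      [0,    0, -1e8],
-#      [0,    0,    0]]
-
-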
-def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
- """
- Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
- """
- src_len = shape_list(mask)[1]
- tgt_len = tgt_len if tgt_len is not None else src_len
- one_cst = tf.constant(1.0)
- mask = tf.cast(mask, dtype=one_cst.dtype)
- expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
-
- return (one_cst - expanded_mask) * LARGE_NEGATIVE
-
-
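-# Sketch (hypothetical values): a padding mask `[[1, 1, 0]]` with `tgt_len=2` expands
-# to shape `[1, 1, 2, 3]`, where kept positions map to 0.0 and the padded position maps
-# to LARGE_NEGATIVE, ready to be added to the raw attention scores:
-#
-#     [[[[0, 0, -1e8],
-#        [0, 0, -1e8]]]]
-
-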
-class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(keras.layers.Embedding):
- """
- This module learns positional embeddings up to a fixed maximum size.
- """
-
- def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
- super().__init__(num_embeddings, embedding_dim, **kwargs)
-
- def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
- """Input is expected to be of size [bsz x seqlen]."""
- seq_len = input_shape[1]
- position_ids = tf.range(seq_len, delta=1, name="range")
- position_ids += past_key_values_length
- return super().call(tf.cast(position_ids, dtype=tf.int32))
-
-
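-# Sketch (hypothetical values): for an input of shape `(batch, 5)` with
-# `past_key_values_length=3`, the lookup above uses position ids `[3, 4, 5, 6, 7]`,
-# i.e. cached generation continues the position count where the cache left off.
-
-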
-class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer):
-    """Multi-headed attention from "Attention Is All You Need"."""
-
- def __init__(
- self,
- embed_dim: int,
- num_heads: int,
- dropout: float = 0.0,
- is_decoder: bool = False,
- bias: bool = True,
- **kwargs,
- ):
- super().__init__(**kwargs)
- self.embed_dim = embed_dim
-
- self.num_heads = num_heads
- self.dropout = keras.layers.Dropout(dropout)
- self.head_dim = embed_dim // num_heads
- assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
- self.scaling = self.head_dim ** -0.5
- self.is_decoder = is_decoder
-
- self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
- self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
- self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
- self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
-
- def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
- return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
-
- def call(
- self,
- hidden_states: tf.Tensor,
- key_value_states: tf.Tensor | None = None,
- past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
- attention_mask: tf.Tensor | None = None,
- layer_head_mask: tf.Tensor | None = None,
- training=False,
- ) -> Tuple[tf.Tensor, tf.Tensor | None]:
- """Input shape: Batch x Time x Channel"""
-
- # if key_value_states are provided this layer is used as a cross-attention layer
- # for the decoder
- is_cross_attention = key_value_states is not None
- bsz, tgt_len, embed_dim = shape_list(hidden_states)
-
- # get query proj
- query_states = self.q_proj(hidden_states) * self.scaling
- # get key, value proj
- if is_cross_attention and past_key_value is not None:
- # reuse k,v, cross_attentions
- key_states = past_key_value[0]
- value_states = past_key_value[1]
- elif is_cross_attention:
- # cross_attentions
- key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
- value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
- elif past_key_value is not None:
- # reuse k, v, self_attention
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
- key_states = tf.concat([past_key_value[0], key_states], axis=2)
- value_states = tf.concat([past_key_value[1], value_states], axis=2)
- else:
- # self_attention
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
- if self.is_decoder:
- # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
- # Further calls to cross_attention layer can then reuse all cross-attention
- # key/value_states (first "if" case)
- # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
- # if encoder bi-directional self-attention `past_key_value` is always `None`
- past_key_value = (key_states, value_states)
-
- proj_shape = (bsz * self.num_heads, -1, self.head_dim)
- query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
- key_states = tf.reshape(key_states, proj_shape)
- value_states = tf.reshape(value_states, proj_shape)
-
- src_len = shape_list(key_states)[1]
- attn_weights = tf.matmul(query_states, key_states, transpose_b=True)
-
- tf.debugging.assert_equal(
- shape_list(attn_weights),
- [bsz * self.num_heads, tgt_len, src_len],
- message=f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {shape_list(attn_weights)}",
- )
-
- if attention_mask is not None:
- tf.debugging.assert_equal(
- shape_list(attention_mask),
- [bsz, 1, tgt_len, src_len],
- message=f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {shape_list(attention_mask)}",
- )
-
- attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
- attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-
- attn_weights = stable_softmax(attn_weights, axis=-1)
-
- if layer_head_mask is not None:
- tf.debugging.assert_equal(
- shape_list(layer_head_mask),
- [self.num_heads],
- message=f"Head mask for a single layer should be of size {(self.num_heads)}, but is {shape_list(layer_head_mask)}",
- )
-
- attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
- attn_weights, (bsz, self.num_heads, tgt_len, src_len)
- )
- attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-
- attn_probs = self.dropout(attn_weights, training=training)
-
- attn_output = tf.matmul(attn_probs, value_states)
-
- tf.debugging.assert_equal(
- shape_list(attn_output),
- [bsz * self.num_heads, tgt_len, self.head_dim],
-            message=f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is {shape_list(attn_output)}",
- )
-
- attn_output = tf.transpose(
- tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
- )
- attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))
-
- attn_output = self.out_proj(attn_output)
- attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))
-
- return attn_output, attn_weights, past_key_value
-
-
-class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
- self.embed_dim = config.d_model
- self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention(
- self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
- )
- self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
- self.dropout = keras.layers.Dropout(config.dropout)
- self.activation_fn = get_tf_activation(config.activation_function)
- self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
- self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
- self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
- self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
-
- def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
- """
- Args:
- hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
- attention_mask (`tf.Tensor`): attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
- layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
- *(encoder_attention_heads,)*
- """
- residual = hidden_states
- hidden_states, self_attn_weights, _ = self.self_attn(
- hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
- )
-
- tf.debugging.assert_equal(
- shape_list(hidden_states),
- shape_list(residual),
- message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
- )
-
- hidden_states = self.dropout(hidden_states, training=training)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = self.activation_dropout(hidden_states, training=training)
- hidden_states = self.fc2(hidden_states)
- hidden_states = self.dropout(hidden_states, training=training)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
- return hidden_states, self_attn_weights
-
-
-class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(keras.layers.Layer):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
- self.embed_dim = config.d_model
- self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention(
- embed_dim=self.embed_dim,
- num_heads=config.decoder_attention_heads,
- dropout=config.attention_dropout,
- name="self_attn",
- is_decoder=True,
- )
- self.dropout = keras.layers.Dropout(config.dropout)
- self.activation_fn = get_tf_activation(config.activation_function)
- self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
-
- self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
- self.encoder_attn = TF{{cookiecutter.camelcase_modelname}}Attention(
- self.embed_dim,
- config.decoder_attention_heads,
- dropout=config.attention_dropout,
- name="encoder_attn",
- is_decoder=True,
- )
- self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
- self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
- self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
- self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
-
- def call(
- self,
- hidden_states,
- attention_mask: tf.Tensor | None = None,
- encoder_hidden_states: tf.Tensor | None = None,
- encoder_attention_mask: tf.Tensor | None = None,
- layer_head_mask: tf.Tensor | None = None,
- cross_attn_layer_head_mask: tf.Tensor | None = None,
- past_key_value: Tuple[tf.Tensor] | None = None,
- training=False,
- ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
- """
- Args:
- hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
- attention_mask (`tf.Tensor`): attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
- encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
- encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
- layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
- *(decoder_attention_heads,)*
- cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
- *(decoder_attention_heads,)*
- past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
- """
- residual = hidden_states
-
- # Self Attention
- # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
- # add present self-attn cache to positions 1,2 of present_key_value tuple
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
- hidden_states=hidden_states,
- past_key_value=self_attn_past_key_value,
- attention_mask=attention_mask,
- layer_head_mask=layer_head_mask,
- )
- hidden_states = self.dropout(hidden_states, training=training)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- # Cross-Attention Block
- cross_attn_present_key_value = None
- cross_attn_weights = None
- if encoder_hidden_states is not None:
- residual = hidden_states
-
- # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
- hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
- hidden_states=hidden_states,
- key_value_states=encoder_hidden_states,
- attention_mask=encoder_attention_mask,
- layer_head_mask=cross_attn_layer_head_mask,
- past_key_value=cross_attn_past_key_value,
- )
- hidden_states = self.dropout(hidden_states, training=training)
- hidden_states = residual + hidden_states
- hidden_states = self.encoder_attn_layer_norm(hidden_states)
-
- # add cross-attn to positions 3,4 of present_key_value tuple
- present_key_value = present_key_value + cross_attn_present_key_value
-
- # Fully Connected
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = self.activation_dropout(hidden_states, training=training)
- hidden_states = self.fc2(hidden_states)
- hidden_states = self.dropout(hidden_states, training=training)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
- return (
- hidden_states,
- self_attn_weights,
- cross_attn_weights,
- present_key_value,
- )
-
-
-class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
- config_class = {{cookiecutter.camelcase_modelname}}Config
- base_model_prefix = "model"
-
-
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
- This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
- generic methods the library implements for all its model (such as downloading or saving, resizing the input
- embeddings, pruning heads etc.)
-
- This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
- it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
- and behavior.
-
- TensorFlow models and layers in `transformers` accept two formats as input:
-
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional argument.
-
-    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
-    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you: just
-    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
-    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
-    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
-    positional argument (sketched just after this docstring):
-
- - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
-
-    Note that when creating models and layers with
-    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) you don't need to worry about
-    any of this, as you can just pass inputs like you would to any other Python function!
-
- Args:
- config ([`~{{cookiecutter.camelcase_modelname}}Config`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
- model weights.
-"""
-
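-# The three input formats described in the docstring above, sketched with hypothetical,
-# already-prepared tensors:
-#
-#     model(input_ids)                                                    # single tensor
-#     model([input_ids, attention_mask])                                  # list, docstring order
-#     model({"input_ids": input_ids, "attention_mask": attention_mask})   # dict keyed by input name
-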
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`tf.Tensor` of shape `({0})`):
- Indices of input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
- Indices of decoder input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
-
- {{cookiecutter.camelcase_modelname}} uses the `eos_token_id` as the starting token for
- `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
- `decoder_input_ids` have to be input (see `past_key_values`).
-
- For translation and summarization training, `decoder_input_ids` should be provided. If no
- `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
- the right for denoising pre-training following the paper.
-        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            If not provided, a default mask that ignores pad tokens will be created. Setting this yourself is not
-            recommended for most use cases.
- head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tf.FloatTensor`, *optional*):
-            Sequence of hidden states at the output of the last layer of the encoder, of shape
-            `(batch_size, sequence_length, hidden_size)`. Used in the cross-attention of the decoder.
-        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- use_cache (`bool`, *optional*, defaults to `True`):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
-            decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
- config will be used instead.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
- used instead.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This
- argument can be used in eager mode, in graph mode the value will always be set to True.
- training (`bool`, *optional*, defaults to `False`):
- Whether or not to use the model in training mode (some modules like dropout modules have different
- behaviors between training and evaluation).
-"""
-
-
-@keras_serializable
-class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer):
-    """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`TF{{cookiecutter.camelcase_modelname}}EncoderLayer`].
-
-    Args:
-        config: {{cookiecutter.camelcase_modelname}}Config
-    """
-
-    config_class = {{cookiecutter.camelcase_modelname}}Config
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
- super().__init__(**kwargs)
- self.config = config
- self.dropout = keras.layers.Dropout(config.dropout)
- self.layerdrop = config.encoder_layerdrop
- self.padding_idx = config.pad_token_id
- self.max_source_positions = config.max_position_embeddings
- self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
-
- self.embed_tokens = embed_tokens
- self.embed_positions = TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(
- config.max_position_embeddings,
- config.d_model,
- name="embed_positions",
- )
- self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
- self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
-
- def get_embed_tokens(self):
- return self.embed_tokens
-
- def set_embed_tokens(self, embed_tokens):
- self.embed_tokens = embed_tokens
-
- @unpack_inputs
- def call(
- self,
- input_ids=None,
- inputs_embeds=None,
- attention_mask=None,
- head_mask=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- training=False,
- ):
- """
- Args:
- input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
-            head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert `input_ids` indices
- into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
- in the config will be used instead.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more detail. This argument can be used only in eager mode, in graph mode the value in the config
- will be used instead.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This
- argument can be used in eager mode, in graph mode the value will always be set to True.
- training (`bool`, *optional*, defaults to `False`):
- Whether or not to use the model in training mode (some modules like dropout modules have different
- behaviors between training and evaluation).
- """
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
- elif input_ids is not None:
- input_shape = shape_list(input_ids)
- elif inputs_embeds is not None:
- input_shape = shape_list(inputs_embeds)[:-1]
- else:
- raise ValueError("You have to specify either input_ids or inputs_embeds")
-
- if inputs_embeds is None:
- check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
- inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
- embed_pos = self.embed_positions(input_shape)
- hidden_states = inputs_embeds + embed_pos
- hidden_states = self.layernorm_embedding(hidden_states)
- hidden_states = self.dropout(hidden_states, training=training)
-
- # check attention mask and invert
- if attention_mask is not None:
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- attention_mask = _expand_mask(attention_mask)
-
- encoder_states = () if output_hidden_states else None
- all_attentions = () if output_attentions else None
-
- # check if head_mask has a correct number of layers specified if desired
- if head_mask is not None:
- tf.debugging.assert_equal(
- shape_list(head_mask)[0],
- len(self.layers),
- message=f"The head_mask should be specified for {len(self.layers)} layers, but it is for {shape_list(head_mask)[0]}.",
- )
-
- # encoder layers
- for idx, encoder_layer in enumerate(self.layers):
-
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states,)
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
- dropout_probability = random.uniform(0, 1)
- if training and (dropout_probability < self.layerdrop): # skip the layer
- continue
-
- hidden_states, attn = encoder_layer(
- hidden_states,
- attention_mask,
- head_mask[idx] if head_mask is not None else None,
- )
-
- if output_attentions:
- all_attentions += (attn,)
-
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states,)
-
- if not return_dict:
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
- return TFBaseModelOutput(
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
- )
-
-
-@keras_serializable
-class TF{{cookiecutter.camelcase_modelname}}Decoder(keras.layers.Layer):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
-    [`TF{{cookiecutter.camelcase_modelname}}DecoderLayer`].
-
-    Args:
-        config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens: output embedding
-    """
-
-    config_class = {{cookiecutter.camelcase_modelname}}Config
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
- super().__init__(**kwargs)
- self.config = config
- self.padding_idx = config.pad_token_id
- self.embed_tokens = embed_tokens
- self.layerdrop = config.decoder_layerdrop
- self.embed_positions = TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(
- config.max_position_embeddings,
- config.d_model,
- name="embed_positions",
- )
- self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
- self.layers = [TF{{cookiecutter.camelcase_modelname}}DecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
- self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
-
- self.dropout = keras.layers.Dropout(config.dropout)
-
- def get_embed_tokens(self):
- return self.embed_tokens
-
- def set_embed_tokens(self, embed_tokens):
- self.embed_tokens = embed_tokens
-
- @unpack_inputs
- def call(
- self,
- input_ids=None,
- inputs_embeds=None,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- head_mask=None,
- cross_attn_head_mask=None,
- past_key_values=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- training=False,
- ):
- r"""
- Args:
- input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
- of the decoder.
- encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
- Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
- Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
- decoding.
-
- If `past_key_values` are used, the user can optionally input only the last
- `decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
- sequence_length)`.
- inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices
- into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
- in the config will be used instead.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more detail. This argument can be used only in eager mode, in graph mode the value in the config
- will be used instead.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This
- argument can be used in eager mode, in graph mode the value will always be set to True.
- training (`bool`, *optional*, defaults to `False`):
- Whether or not to use the model in training mode (some modules like dropout modules have different
- behaviors between training and evaluation).
- """
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
- elif input_ids is not None:
- input_shape = shape_list(input_ids)
- elif inputs_embeds is not None:
- input_shape = shape_list(inputs_embeds)[:-1]
- else:
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
- past_key_values_length = (
- shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
- )
-
- # embed positions
- positions = self.embed_positions(input_shape, past_key_values_length)
-
- if inputs_embeds is None:
- check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
- inputs_embeds = self.embed_tokens(input_ids)
-
- hidden_states = inputs_embeds
-
- attention_mask, combined_attention_mask = self.compute_combined_attns_mask(
- input_ids, attention_mask, input_shape, past_key_values_length
- )
-
- if encoder_hidden_states is not None and encoder_attention_mask is not None:
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
-
- hidden_states = self.layernorm_embedding(hidden_states + positions)
- hidden_states = self.dropout(hidden_states, training=training)
-
- # decoder layers
- all_hidden_states = () if output_hidden_states else None
- all_self_attns = () if output_attentions else None
- all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None
- present_key_values = () if use_cache else None
-
- # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired
- for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]:
- if attn_mask is not None:
- tf.debugging.assert_equal(
- shape_list(attn_mask)[0],
- len(self.layers),
- message=f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for {shape_list(attn_mask)[0]}.",
- )
-
- for idx, decoder_layer in enumerate(self.layers):
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- dropout_probability = random.uniform(0, 1)
-
- if training and (dropout_probability < self.layerdrop):
- continue
-
- past_key_value = past_key_values[idx] if past_key_values is not None else None
-
- hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer(
- hidden_states,
- attention_mask=combined_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- layer_head_mask=head_mask[idx] if head_mask is not None else None,
- cross_attn_layer_head_mask=cross_attn_head_mask[idx]
- if cross_attn_head_mask is not None
- else None,
- past_key_value=past_key_value,
- )
-
- if use_cache:
- present_key_values += (present_key_value,)
-
- if output_attentions:
- all_self_attns += (layer_self_attn,)
-
- if encoder_hidden_states is not None:
- all_cross_attns += (layer_cross_attn,)
-
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- if not return_dict:
- return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns
- else:
- return TFBaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- past_key_values=present_key_values,
- hidden_states=all_hidden_states,
- attentions=all_self_attns,
- cross_attentions=all_cross_attns,
- )
-
- @tf.function
- def compute_combined_attns_mask(self, input_ids, attention_mask, input_shape, past_key_values_length):
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- combined_attention_mask = None
- if input_shape[-1] > 1:
- combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
- else:
- combined_attention_mask = _expand_mask(
- tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1]
- )
-
-        # Only build a default padding mask when the caller did not provide one
-        if attention_mask is None:
-            if input_ids is not None and input_shape[-1] > 1:
-                attention_mask = tf.cast(
-                    tf.math.not_equal(input_ids, self.config.pad_token_id), input_ids.dtype
-                )
-                attention_mask = tf.concat(
-                    [
-                        tf.ones((input_shape[0], past_key_values_length), dtype=attention_mask.dtype),
-                        attention_mask,
-                    ],
-                    axis=-1,
-                )
-            else:
-                attention_mask = tf.ones((input_shape[0], input_shape[1] + past_key_values_length))
-
- return attention_mask, combined_attention_mask
-
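-# Sketch (hypothetical values): during cached generation the current input has length 1,
-# so the causal branch above is skipped and the combined mask is all zeros over the
-# `past_key_values_length + 1` key positions, i.e. the new token may attend to every
-# cached position.
-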
-
-@keras_serializable
-class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer):
- config_class = {{cookiecutter.camelcase_modelname}}Config
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(**kwargs)
-
- self.config = config
- self.shared = keras.layers.Embedding(
- input_dim=config.vocab_size,
- output_dim=config.d_model,
- embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
- name="model.shared"
- )
- # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
- self.shared.load_weight_prefix = "model.shared"
-
- self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, self.shared, name="encoder")
- self.decoder = TF{{cookiecutter.camelcase_modelname}}Decoder(config, self.shared, name="decoder")
-
- def get_input_embeddings(self):
- return self.shared
-
- def set_input_embeddings(self, new_embeddings):
- self.shared = new_embeddings
- self.encoder.embed_tokens = self.shared
- self.decoder.embed_tokens = self.shared
-
- @unpack_inputs
- def call(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
- past_key_values=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- training=False,
- **kwargs
- ):
-
- if decoder_input_ids is None and decoder_inputs_embeds is None:
- use_cache = False
-
- if encoder_outputs is None:
- encoder_outputs = self.encoder(
- input_ids=input_ids,
- attention_mask=attention_mask,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
- # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True
- elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
- encoder_outputs = TFBaseModelOutput(
- last_hidden_state=encoder_outputs[0],
- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
- )
- # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False
- elif not return_dict and not isinstance(encoder_outputs, tuple):
- encoder_outputs = encoder_outputs.to_tuple()
-
- decoder_outputs = self.decoder(
- decoder_input_ids,
- attention_mask=decoder_attention_mask,
- encoder_hidden_states=encoder_outputs[0],
- encoder_attention_mask=attention_mask,
- head_mask=decoder_head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- past_key_values=past_key_values,
- inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
-
- if not return_dict:
- return decoder_outputs + encoder_outputs
-
- return TFSeq2SeqModelOutput(
- last_hidden_state=decoder_outputs.last_hidden_state,
- past_key_values=decoder_outputs.past_key_values,
- decoder_hidden_states=decoder_outputs.hidden_states,
- decoder_attentions=decoder_outputs.attentions,
- cross_attentions=decoder_outputs.cross_attentions,
- encoder_last_hidden_state=encoder_outputs.last_hidden_state,
- encoder_hidden_states=encoder_outputs.hidden_states,
- encoder_attentions=encoder_outputs.attentions,
- )
-
- def build(self, input_shape=None):
- # The shared/tied weights expect to be in the model base namespace
- # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
- # the current one.
- with tf.name_scope(self.shared.load_weight_prefix + '/' + self.shared.name + '/'):
- self.shared.build(None)
-
-
-@add_start_docstrings(
- "The bare {{cookiecutter.uppercase_modelname}} Model outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
-
- self.model = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="model")
-
- def get_encoder(self):
- return self.model.encoder
-
- def get_decoder(self):
- return self.model.decoder
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TFSeq2SeqModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def call(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
- past_key_values=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- training=False,
- **kwargs
- ):
-
- outputs = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- head_mask=head_mask,
- decoder_head_mask=decoder_head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- encoder_outputs=encoder_outputs,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- decoder_inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training,
- )
-
- return outputs
-
-
-# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
-class BiasLayer(keras.layers.Layer):
- """
- Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
- so all weights have to be registered in a layer.
- """
-
- def __init__(self, shape, initializer, trainable, name, **kwargs):
- super().__init__(name=name, **kwargs)
- # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
- # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
- # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
- self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
-
- def call(self, x):
- return x + self.bias
-
-
-@add_start_docstrings(
- "The {{cookiecutter.uppercase_modelname}} Model with a language modeling head. Can be used for summarization.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel):
- _keys_to_ignore_on_load_unexpected = [
- r"model.encoder.embed_tokens.weight",
- r"model.decoder.embed_tokens.weight",
- ]
-
- def __init__(self, config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
- self.model = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="model")
- self.use_cache = config.use_cache
-        # final_logits_bias is registered as a buffer in pytorch, so not trainable for the sake of consistency.
- self.bias_layer = BiasLayer(
- name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
- )
-
- def get_decoder(self):
- return self.model.decoder
-
- def get_encoder(self):
- return self.model.encoder
-
- def get_bias(self):
- return {"final_logits_bias": self.bias_layer.bias}
-
- def set_bias(self, value):
- # Replaces the existing layers containing bias for correct (de)serialization.
- vocab_size = value["final_logits_bias"].shape[-1]
- self.bias_layer = BiasLayer(
- name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
- )
- self.bias_layer.bias.assign(value["final_logits_bias"])
-
- def get_output_embeddings(self):
- return self.get_input_embeddings()
-
- def set_output_embeddings(self, value):
- self.set_input_embeddings(value)
-
- @unpack_inputs
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
- def call(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- encoder_outputs: Optional[TFBaseModelOutput] = None,
- past_key_values=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- labels=None,
- training=False,
- ):
- """
- Returns:
-
- Examples:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
- >>> import tensorflow as tf
- >>> mname = '{{cookiecutter.checkpoint_identifier}}'
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained(mname)
-        >>> TXT = "My friends are <mask> but they eat too many carbs."
- >>> model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained(mname)
- >>> batch = tokenizer([TXT], return_tensors='tf')
- >>> logits = model(inputs=batch.input_ids).logits
- >>> probs = tf.nn.softmax(logits[0])
- >>> # probs[5] is associated with the mask token
- ```"""
-
- if labels is not None:
- use_cache = False
- if decoder_input_ids is None and decoder_inputs_embeds is None:
- decoder_input_ids = shift_tokens_right(
- labels, self.config.pad_token_id, self.config.decoder_start_token_id
- )
-
- outputs = self.model(
- input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- encoder_outputs=encoder_outputs,
- decoder_attention_mask=decoder_attention_mask,
- head_mask=head_mask,
- decoder_head_mask=decoder_head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- decoder_inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- training=training
- )
- lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
- lm_logits = self.bias_layer(lm_logits)
- masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)
-
- if not return_dict:
- output = (lm_logits,) + outputs[1:]
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
- return TFSeq2SeqLMOutput(
- loss=masked_lm_loss,
- logits=lm_logits,
-            past_key_values=outputs.past_key_values,  # index 1 of decoder outputs
-            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of decoder outputs
-            decoder_attentions=outputs.decoder_attentions,  # index 3 of decoder outputs
-            cross_attentions=outputs.cross_attentions,  # index 4 of decoder outputs
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
-            encoder_hidden_states=outputs.encoder_hidden_states,  # index 1 of encoder outputs
-            encoder_attentions=outputs.encoder_attentions,  # index 2 of encoder outputs
- )
-
- def prepare_inputs_for_generation(
- self,
- decoder_input_ids,
- past_key_values=None,
- attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- use_cache=None,
- encoder_outputs=None,
- **kwargs
- ):
- # cut decoder_input_ids if past is used
- if past_key_values is not None:
- decoder_input_ids = decoder_input_ids[:, -1:]
-
- return {
- "input_ids": None, # needs to be passed to make Keras.layer.__call__ happy
- "encoder_outputs": encoder_outputs,
- "past_key_values": past_key_values,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "head_mask": head_mask,
- "decoder_head_mask": decoder_head_mask,
- "cross_attn_head_mask": cross_attn_head_mask,
- "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
- }
-
- def hf_compute_loss(self, labels, logits):
- """CrossEntropyLoss that ignores pad tokens"""
- loss_fn = keras.losses.SparseCategoricalCrossentropy(
- from_logits=True,
- reduction=keras.losses.Reduction.NONE,
- )
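- # flatten the labels and keep only non-padding positions so pad tokens do not contribute to the loss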
- melted_labels = tf.reshape(labels, (-1,))
- active_loss = tf.not_equal(melted_labels, self.config.pad_token_id)
- reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
- labels = tf.boolean_mask(melted_labels, active_loss)
- return loss_fn(labels, reduced_logits)
-{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100755
index db109b27fc8aae..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,3264 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch {{cookiecutter.modelname}} model. """
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-
-import math
-import os
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from typing import Optional, Tuple, Union
-
-from ...activations import ACT2FN
-from ...utils import (
- add_code_sample_docstrings,
- add_start_docstrings,
- add_start_docstrings_to_model_forward,
- replace_return_docstrings,
-)
-from ...modeling_outputs import (
- BaseModelOutputWithPastAndCrossAttentions,
- CausalLMOutputWithCrossAttentions,
- MaskedLMOutput,
- MultipleChoiceModelOutput,
- QuestionAnsweringModelOutput,
- SequenceClassifierOutput,
- TokenClassifierOutput,
-)
-from ...modeling_utils import PreTrainedModel, SequenceSummary
-from ...pytorch_utils import (
- apply_chunking_to_forward,
- find_pruneable_heads_and_indices,
- prune_linear_layer,
-)
-from ...utils import logging
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-
-
-def load_tf_weights_in_{{cookiecutter.lowercase_modelname}}(model, config, tf_checkpoint_path):
- """Load tf checkpoints in a pytorch model."""
- try:
- import re
-
- import numpy as np
- import tensorflow as tf
- except ImportError:
- logger.error(
- "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
- "https://www.tensorflow.org/install/ for installation instructions."
- )
- raise
- tf_path = os.path.abspath(tf_checkpoint_path)
- logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
- # Load weights from TF model
- init_vars = tf.train.list_variables(tf_path)
- names = []
- arrays = []
- for name, shape in init_vars:
- logger.info(f"Loading TF weight {name} with shape {shape}")
- array = tf.train.load_variable(tf_path, name)
- names.append(name)
- arrays.append(array)
-
- for name, array in zip(names, arrays):
- name = name.split("/")
- # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
- # which are not required for using the pretrained model
- if any(
- n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
- for n in name
- ):
- logger.info(f"Skipping {'/'.join(name)}")
- continue
- pointer = model
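- # walk the PyTorch module tree, following each component of the TF variable name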
- for m_name in name:
- if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
- scope_names = re.split(r"_(\d+)", m_name)
- else:
- scope_names = [m_name]
- if scope_names[0] == "kernel" or scope_names[0] == "gamma":
- pointer = getattr(pointer, "weight")
- elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
- pointer = getattr(pointer, "bias")
- elif scope_names[0] == "output_weights":
- pointer = getattr(pointer, "weight")
- elif scope_names[0] == "squad":
- pointer = getattr(pointer, "classifier")
- else:
- try:
- pointer = getattr(pointer, scope_names[0])
- except AttributeError:
- logger.info(f"Skipping {'/'.join(name)}")
- continue
- if len(scope_names) >= 2:
- num = int(scope_names[1])
- pointer = pointer[num]
- if m_name[-11:] == "_embeddings":
- pointer = getattr(pointer, "weight")
- elif m_name == "kernel":
- array = np.transpose(array)
- try:
- assert (
- pointer.shape == array.shape
- ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
- except AssertionError as e:
- e.args += (pointer.shape, array.shape)
- raise
- logger.info(f"Initialize PyTorch weight {name}")
- pointer.data = torch.from_numpy(array)
- return model
-
-
-# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Embeddings(nn.Module):
- """Construct the embeddings from word, position and token_type embeddings."""
-
- def __init__(self, config):
- super().__init__()
- self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
- self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
- self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
- # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
- # any TensorFlow checkpoint file
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
- # position_ids (1, len position emb) is contiguous in memory and exported when serialized
- self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
- self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
- self.register_buffer(
- "token_type_ids",
- torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
- persistent=False,
- )
-
- def forward(
- self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
- ):
- if input_ids is not None:
- input_shape = input_ids.size()
- else:
- input_shape = inputs_embeds.size()[:-1]
-
- seq_length = input_shape[1]
-
- if position_ids is None:
- position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
-
- # When token_type_ids is not passed, use the all-zeros buffer registered in the constructor. This usually
- # happens when the ids are auto-generated; the registered buffer also lets users trace the model without
- # passing token_type_ids, which solves issue #5664
- if token_type_ids is None:
- if hasattr(self, "token_type_ids"):
- buffered_token_type_ids = self.token_type_ids[:, :seq_length]
- buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
- token_type_ids = buffered_token_type_ids_expanded
- else:
- token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
- if inputs_embeds is None:
- inputs_embeds = self.word_embeddings(input_ids)
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
- embeddings = inputs_embeds + token_type_embeddings
- if self.position_embedding_type == "absolute":
- position_embeddings = self.position_embeddings(position_ids)
- embeddings += position_embeddings
- embeddings = self.LayerNorm(embeddings)
- embeddings = self.dropout(embeddings)
- return embeddings
-
-
-# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module):
- def __init__(self, config, position_embedding_type=None):
- super().__init__()
- if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
- raise ValueError(
- f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
- f"heads ({config.num_attention_heads})"
- )
-
- self.num_attention_heads = config.num_attention_heads
- self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
- self.all_head_size = self.num_attention_heads * self.attention_head_size
-
- self.query = nn.Linear(config.hidden_size, self.all_head_size)
- self.key = nn.Linear(config.hidden_size, self.all_head_size)
- self.value = nn.Linear(config.hidden_size, self.all_head_size)
-
- self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
- self.position_embedding_type = position_embedding_type or getattr(config, "position_embedding_type", "absolute")
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
- self.max_position_embeddings = config.max_position_embeddings
- self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
-
- self.is_decoder = config.is_decoder
-
- def transpose_for_scores(self, x):
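- # reshape (batch, seq_len, all_head_size) into (batch, num_heads, seq_len, head_size) for per-head attention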
- new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
- x = x.view(*new_x_shape)
- return x.permute(0, 2, 1, 3)
-
- def forward(
- self,
- hidden_states,
- attention_mask=None,
- head_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_value=None,
- output_attentions=False,
- ):
- mixed_query_layer = self.query(hidden_states)
-
- # If this is instantiated as a cross-attention module, the keys
- # and values come from an encoder; the attention mask needs to be
- # such that the encoder's padding tokens are not attended to.
- is_cross_attention = encoder_hidden_states is not None
-
- if is_cross_attention and past_key_value is not None:
- # reuse k,v, cross_attentions
- key_layer = past_key_value[0]
- value_layer = past_key_value[1]
- attention_mask = encoder_attention_mask
- elif is_cross_attention:
- key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
- value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
- attention_mask = encoder_attention_mask
- elif past_key_value is not None:
- key_layer = self.transpose_for_scores(self.key(hidden_states))
- value_layer = self.transpose_for_scores(self.value(hidden_states))
- key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
- value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
- else:
- key_layer = self.transpose_for_scores(self.key(hidden_states))
- value_layer = self.transpose_for_scores(self.value(hidden_states))
-
- query_layer = self.transpose_for_scores(mixed_query_layer)
-
- if self.is_decoder:
- # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
- # Further calls to cross_attention layer can then reuse all cross-attention
- # key/value_states (first "if" case)
- # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
- # if encoder bi-directional self-attention `past_key_value` is always `None`
- past_key_value = (key_layer, value_layer)
-
- # Take the dot product between "query" and "key" to get the raw attention scores.
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
- seq_length = hidden_states.size()[1]
- position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
- position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
- distance = position_ids_l - position_ids_r
- positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
- positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
-
- if self.position_embedding_type == "relative_key":
- relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
- attention_scores = attention_scores + relative_position_scores
- elif self.position_embedding_type == "relative_key_query":
- relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
- relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
- attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
-
- attention_scores = attention_scores / math.sqrt(self.attention_head_size)
- if attention_mask is not None:
- # Apply the attention mask (precomputed for all layers in {{cookiecutter.camelcase_modelname}}Model forward() function)
- attention_scores = attention_scores + attention_mask
-
- # Normalize the attention scores to probabilities.
- attention_probs = nn.functional.softmax(attention_scores, dim=-1)
-
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
- attention_probs = self.dropout(attention_probs)
-
- # Mask heads if we want to
- if head_mask is not None:
- attention_probs = attention_probs * head_mask
-
- context_layer = torch.matmul(attention_probs, value_layer)
-
- context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
- new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
- context_layer = context_layer.view(*new_context_layer_shape)
-
- outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
-
- if self.is_decoder:
- outputs = outputs + (past_key_value,)
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}SelfOutput(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
- def forward(self, hidden_states, input_tensor):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states)
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
- def __init__(self, config, position_embedding_type=None):
- super().__init__()
- self.self = {{cookiecutter.camelcase_modelname}}SelfAttention(config, position_embedding_type=position_embedding_type)
- self.output = {{cookiecutter.camelcase_modelname}}SelfOutput(config)
- self.pruned_heads = set()
-
- def prune_heads(self, heads):
- if len(heads) == 0:
- return
- heads, index = find_pruneable_heads_and_indices(
- heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
- )
-
- # Prune linear layers
- self.self.query = prune_linear_layer(self.self.query, index)
- self.self.key = prune_linear_layer(self.self.key, index)
- self.self.value = prune_linear_layer(self.self.value, index)
- self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
-
- # Update hyper params and store pruned heads
- self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
- self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
- self.pruned_heads = self.pruned_heads.union(heads)
-
- def forward(
- self,
- hidden_states,
- attention_mask=None,
- head_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_value=None,
- output_attentions=False,
- ):
- self_outputs = self.self(
- hidden_states,
- attention_mask,
- head_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- past_key_value,
- output_attentions,
- )
- attention_output = self.output(self_outputs[0], hidden_states)
- outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
- return outputs
-
-
-# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Intermediate(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
- if isinstance(config.hidden_act, str):
- self.intermediate_act_fn = ACT2FN[config.hidden_act]
- else:
- self.intermediate_act_fn = config.hidden_act
-
- def forward(self, hidden_states):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.intermediate_act_fn(hidden_states)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Output(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
- def forward(self, hidden_states, input_tensor):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states)
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Layer(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.chunk_size_feed_forward = config.chunk_size_feed_forward
- self.seq_len_dim = 1
- self.attention = {{cookiecutter.camelcase_modelname}}Attention(config)
- self.is_decoder = config.is_decoder
- self.add_cross_attention = config.add_cross_attention
- if self.add_cross_attention:
- assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
- self.crossattention = {{cookiecutter.camelcase_modelname}}Attention(config, position_embedding_type="absolute")
- self.intermediate = {{cookiecutter.camelcase_modelname}}Intermediate(config)
- self.output = {{cookiecutter.camelcase_modelname}}Output(config)
-
- def forward(
- self,
- hidden_states,
- attention_mask=None,
- head_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_value=None,
- output_attentions=False,
- ):
- # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
- self_attention_outputs = self.attention(
- hidden_states,
- attention_mask,
- head_mask,
- output_attentions=output_attentions,
- past_key_value=self_attn_past_key_value,
- )
- attention_output = self_attention_outputs[0]
-
- # if decoder, the last output is tuple of self-attn cache
- if self.is_decoder:
- outputs = self_attention_outputs[1:-1]
- present_key_value = self_attention_outputs[-1]
- else:
- outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
-
- cross_attn_present_key_value = None
- if self.is_decoder and encoder_hidden_states is not None:
- assert hasattr(
- self, "crossattention"
- ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
-
- # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
- cross_attention_outputs = self.crossattention(
- attention_output,
- attention_mask,
- head_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- cross_attn_past_key_value,
- output_attentions,
- )
- attention_output = cross_attention_outputs[0]
- outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
-
- # add cross-attn cache to positions 3,4 of present_key_value tuple
- cross_attn_present_key_value = cross_attention_outputs[-1]
- present_key_value = present_key_value + cross_attn_present_key_value
-
- layer_output = apply_chunking_to_forward(
- self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
- )
- outputs = (layer_output,) + outputs
-
- # if decoder, return the attn key/values as the last output
- if self.is_decoder:
- outputs = outputs + (present_key_value,)
-
- return outputs
-
- def feed_forward_chunk(self, attention_output):
- intermediate_output = self.intermediate(attention_output)
- layer_output = self.output(intermediate_output, attention_output)
- return layer_output
-
-
-# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}Encoder(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.config = config
- self.layer = nn.ModuleList([{{cookiecutter.camelcase_modelname}}Layer(config) for _ in range(config.num_hidden_layers)])
- self.gradient_checkpointing = False
-
- def forward(
- self,
- hidden_states,
- attention_mask=None,
- head_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_values=None,
- use_cache=None,
- output_attentions=False,
- output_hidden_states=False,
- return_dict=True,
- ):
- if self.gradient_checkpointing and self.training and use_cache:
- logger.warning(
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
- )
- use_cache = False
-
- all_hidden_states = () if output_hidden_states else None
- all_self_attentions = () if output_attentions else None
- all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
- next_decoder_cache = () if use_cache else None
-
- for i, layer_module in enumerate(self.layer):
- if output_hidden_states:
- all_hidden_states = all_hidden_states + (hidden_states,)
-
- layer_head_mask = head_mask[i] if head_mask is not None else None
- past_key_value = past_key_values[i] if past_key_values is not None else None
-
- if self.gradient_checkpointing and self.training:
- layer_outputs = self._gradient_checkpointing_func(
- layer_module.__call__,
- hidden_states,
- attention_mask,
- layer_head_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- past_key_value,
- output_attentions,
- )
- else:
- layer_outputs = layer_module(
- hidden_states,
- attention_mask,
- layer_head_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- past_key_value,
- output_attentions,
- )
-
- hidden_states = layer_outputs[0]
- if use_cache:
- next_decoder_cache += (layer_outputs[-1],)
- if output_attentions:
- all_self_attentions = all_self_attentions + (layer_outputs[1],)
- if self.config.add_cross_attention:
- all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
-
- if output_hidden_states:
- all_hidden_states = all_hidden_states + (hidden_states,)
-
- if not return_dict:
- return tuple(
- v
- for v in [
- hidden_states,
- next_decoder_cache,
- all_hidden_states,
- all_self_attentions,
- all_cross_attentions,
- ]
- if v is not None
- )
- return BaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- past_key_values=next_decoder_cache,
- hidden_states=all_hidden_states,
- attentions=all_self_attentions,
- cross_attentions=all_cross_attentions,
- )
-
-
-# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
- if isinstance(config.hidden_act, str):
- self.transform_act_fn = ACT2FN[config.hidden_act]
- else:
- self.transform_act_fn = config.hidden_act
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-
- def forward(self, hidden_states):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.transform_act_fn(hidden_states)
- hidden_states = self.LayerNorm(hidden_states)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}LMPredictionHead(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.transform = {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config)
-
- # The output weights are the same as the input embeddings, but there is
- # an output-only bias for each token.
- self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
- self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
- # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
- self.decoder.bias = self.bias
-
- def forward(self, hidden_states):
- hidden_states = self.transform(hidden_states)
- hidden_states = self.decoder(hidden_states)
- return hidden_states
-
-
-# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}OnlyMLMHead(nn.Module):
- def __init__(self, config):
- super().__init__()
- self.predictions = {{cookiecutter.camelcase_modelname}}LMPredictionHead(config)
-
- def forward(self, sequence_output):
- prediction_scores = self.predictions(sequence_output)
- return prediction_scores
-
-
-class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
- """
- An abstract class to handle weights initialization and
- a simple interface for downloading and loading pretrained models.
- """
-
- config_class = {{cookiecutter.camelcase_modelname}}Config
- load_tf_weights = load_tf_weights_in_{{cookiecutter.lowercase_modelname}}
- base_model_prefix = "{{cookiecutter.lowercase_modelname}}"
- supports_gradient_checkpointing = True
- _keys_to_ignore_on_load_missing = [r"position_ids"]
-
- def _init_weights(self, module):
- """ Initialize the weights """
- if isinstance(module, nn.Linear):
- # Slightly different from the TF version which uses truncated_normal for initialization
- # cf https://github.com/pytorch/pytorch/pull/5617
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
- if module.bias is not None:
- module.bias.data.zero_()
- elif isinstance(module, nn.Embedding):
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
- if module.padding_idx is not None:
- module.weight.data[module.padding_idx].zero_()
- elif isinstance(module, nn.LayerNorm):
- module.bias.data.zero_()
- module.weight.data.fill_(1.0)
-
-
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
- This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
- usage and behavior.
-
- Parameters:
- config ([`~{{cookiecutter.camelcase_modelname}}Config`]): Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the configuration.
- Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`torch.LongTensor` of shape `({0})`):
- Indices of input sequence tokens in the vocabulary.
-
- Indices can be obtained using [`{{cookiecutter.camelcase_modelname}}Tokenizer`].
- See [`PreTrainedTokenizer.encode`] and
- [`PreTrainedTokenizer.__call__`] for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
-
- - 0 corresponds to a *sentence A* token,
- - 1 corresponds to a *sentence B* token.
-
- [What are token type IDs?](../glossary#token-type-ids)
- position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
- Indices of positions of each input sequence tokens in the position embeddings.
- Selected in the range `[0, config.max_position_embeddings - 1]`.
-
- [What are position IDs?](../glossary#position-ids)
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert *input_ids* indices into associated vectors
- than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
- "The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- """
-
- The model can behave as an encoder (with only self-attention) as well
- as a decoder, in which case a layer of cross-attention is added between
- the self-attention layers, following the architecture described in [Attention is
- all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
- Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
-
- To behave as a decoder the model needs to be initialized with the
- `is_decoder` argument of the configuration set to `True`.
- To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
- argument and `add_cross_attention` set to `True`; `encoder_hidden_states` is then expected
- as an input to the forward pass.
- """
-
- def __init__(self, config):
- super().__init__(config)
- self.config = config
-
- self.embeddings = {{cookiecutter.camelcase_modelname}}Embeddings(config)
- self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self):
- return self.embeddings.word_embeddings
-
- def set_input_embeddings(self, value):
- self.embeddings.word_embeddings = value
-
- def _prune_heads(self, heads_to_prune):
- """Prunes heads of the model.
- heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
- See base class PreTrainedModel
- """
- for layer, heads in heads_to_prune.items():
- self.encoder.layer[layer].attention.prune_heads(heads)
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=BaseModelOutputWithPastAndCrossAttentions,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- past_key_values=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
- if the model is configured as a decoder.
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask
- is used in the cross-attention if the model is configured as a decoder.
- Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
- decoding (see `past_key_values`).
- """
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
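- # caching is only relevant when the model is configured as a decoder; force it off otherwise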
- if self.config.is_decoder:
- use_cache = use_cache if use_cache is not None else self.config.use_cache
- else:
- use_cache = False
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
- elif input_ids is not None:
- self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
- input_shape = input_ids.size()
- elif inputs_embeds is not None:
- input_shape = inputs_embeds.size()[:-1]
- else:
- raise ValueError("You have to specify either input_ids or inputs_embeds")
-
- batch_size, seq_length = input_shape
- device = input_ids.device if input_ids is not None else inputs_embeds.device
-
- # length of the key/value states cached from previous decoding steps
- past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-
- if attention_mask is None:
- attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
-
- if token_type_ids is None:
- if hasattr(self.embeddings, "token_type_ids"):
- buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
- buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
- token_type_ids = buffered_token_type_ids_expanded
- else:
- token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
- # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
- # ourselves in which case we just need to make it broadcastable to all heads.
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
-
- # If a 2D or 3D attention mask is provided for the cross-attention
- # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
- if self.config.is_decoder and encoder_hidden_states is not None:
- encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
- encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
- if encoder_attention_mask is None:
- encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
- else:
- encoder_extended_attention_mask = None
-
- # Prepare head mask if needed
- # 1.0 in head_mask indicate we keep the head
- # attention_probs has shape bsz x n_heads x N x N
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
- head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-
- embedding_output = self.embeddings(
- input_ids=input_ids,
- position_ids=position_ids,
- token_type_ids=token_type_ids,
- inputs_embeds=inputs_embeds,
- past_key_values_length=past_key_values_length,
- )
- encoder_outputs = self.encoder(
- embedding_output,
- attention_mask=extended_attention_mask,
- head_mask=head_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_extended_attention_mask,
- past_key_values=past_key_values,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- sequence_output = encoder_outputs[0]
-
- if not return_dict:
- return (sequence_output,) + encoder_outputs[1:]
-
- return BaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=sequence_output,
- past_key_values=encoder_outputs.past_key_values,
- hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions,
- cross_attentions=encoder_outputs.cross_attentions,
- )
-
-
-@add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
-class {{cookiecutter.camelcase_modelname}}ForMaskedLM({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
-
- if config.is_decoder:
- logger.warning(
- "If you want to use `{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for "
- "bi-directional self-attention."
- )
-
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_output_embeddings(self):
- return self.cls.predictions.decoder
-
- def set_output_embeddings(self, new_embeddings):
- self.cls.predictions.decoder = new_embeddings
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=MaskedLMOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- labels=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the masked language modeling loss.
- Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring).
- Tokens with indices set to `-100` are ignored (masked); the loss is only computed for the tokens with labels
- in `[0, ..., config.vocab_size]`.
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
- prediction_scores = self.cls(sequence_output)
-
- masked_lm_loss = None
- if labels is not None:
- loss_fct = CrossEntropyLoss() # -100 index = padding token
- masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
-
- if not return_dict:
- output = (prediction_scores,) + outputs[1:]
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
- return MaskedLMOutput(
- loss=masked_lm_loss,
- logits=prediction_scores,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
- def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
- input_shape = input_ids.shape
- effective_batch_size = input_shape[0]
-
- # add a dummy PAD token at the end (masked out by a zero in the attention mask) for the model to predict
- assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
- attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
- dummy_token = torch.full(
- (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
- )
- input_ids = torch.cat([input_ids, dummy_token], dim=1)
-
- return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a `language modeling` head on top for CLM fine-tuning. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING
-)
-class {{cookiecutter.camelcase_modelname}}ForCausalLM({{cookiecutter.camelcase_modelname}}PreTrainedModel):
-
- _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
-
- def __init__(self, config):
- super().__init__(config)
-
- if not config.is_decoder:
- logger.warning("If you want to use `{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`")
-
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_output_embeddings(self):
- return self.cls.predictions.decoder
-
- def set_output_embeddings(self, new_embeddings):
- self.cls.predictions.decoder = new_embeddings
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- inputs_embeds=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- head_mask=None,
- cross_attn_head_mask=None,
- past_key_values=None,
- labels=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
- the model is configured as a decoder.
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
- tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
- additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
- model.
-
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
- decoding.
-
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
- ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
- decoding (see `past_key_values`).
-
- Returns:
-
- Example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForCausalLM, {{cookiecutter.camelcase_modelname}}Config
- >>> import torch
-
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> config = {{cookiecutter.camelcase_modelname}}Config.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- >>> config.is_decoder = True
- >>> model = {{cookiecutter.camelcase_modelname}}ForCausalLM.from_pretrained('{{cookiecutter.checkpoint_identifier}}', config=config)
-
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
-
- >>> prediction_logits = outputs.logits
- ```"""
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
- prediction_scores = self.cls(sequence_output)
-
- lm_loss = None
- if labels is not None:
- # we are doing next-token prediction; shift prediction scores and input ids by one
- shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
- labels = labels[:, 1:].contiguous()
- loss_fct = CrossEntropyLoss()
- lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
-
- if not return_dict:
- output = (prediction_scores,) + outputs[1:]
- return ((lm_loss,) + output) if lm_loss is not None else output
-
- return CausalLMOutputWithCrossAttentions(
- loss=lm_loss,
- logits=prediction_scores,
- past_key_values=outputs.past_key_values,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
- def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
- input_shape = input_ids.shape
-
- # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
- if attention_mask is None:
- attention_mask = input_ids.new_ones(input_shape)
-
- # cut decoder_input_ids if past is used
- if past_key_values is not None:
- input_ids = input_ids[:, -1:]
-
- return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
-
- def _reorder_cache(self, past_key_values, beam_idx):
- reordered_past = ()
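- # reorder only the self-attention key/value states (first two entries) to follow the selected beams; the remaining entries are passed through unchanged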
- for layer_past in past_key_values:
- reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + layer_past[2:],)
- return reordered_past
-
-
-class {{cookiecutter.camelcase_modelname}}ClassificationHead(nn.Module):
- """Head for sentence-level classification tasks."""
-
- def __init__(self, config):
- super().__init__()
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
- self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
-
- self.config = config
-
- def forward(self, features, **kwargs):
- x = features[:, 0, :] # take <s> token (equiv. to [CLS])
- x = self.dropout(x)
- x = self.dense(x)
- x = ACT2FN[self.config.hidden_act](x)
- x = self.dropout(x)
- x = self.out_proj(x)
- return x
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top (a linear layer on top of
- the pooled output) e.g. for GLUE tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
- self.num_labels = config.num_labels
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.classifier = {{cookiecutter.camelcase_modelname}}ClassificationHead(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=SequenceClassifierOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- labels=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
- Labels for computing the sequence classification/regression loss.
- Indices should be in `[0, ..., config.num_labels - 1]`.
- If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
- logits = self.classifier(sequence_output)
-
- loss = None
- if labels is not None:
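- # infer the problem type from num_labels and the label dtype when the config does not specify one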
- if self.config.problem_type is None:
- if self.num_labels == 1:
- self.config.problem_type = "regression"
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
- self.config.problem_type = "single_label_classification"
- else:
- self.config.problem_type = "multi_label_classification"
-
- if self.config.problem_type == "regression":
- loss_fct = MSELoss()
- if self.num_labels == 1:
- loss = loss_fct(logits.squeeze(), labels.squeeze())
- else:
- loss = loss_fct(logits, labels)
- elif self.config.problem_type == "single_label_classification":
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- elif self.config.problem_type == "multi_label_classification":
- loss_fct = BCEWithLogitsLoss()
- loss = loss_fct(logits, labels)
- if not return_dict:
- output = (logits,) + outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return SequenceClassifierOutput(
- loss=loss,
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of
- the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForMultipleChoice({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
-
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.sequence_summary = SequenceSummary(config)
- self.classifier = nn.Linear(config.hidden_size, 1)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=MultipleChoiceModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- labels=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
- Labels for computing the multiple choice classification loss.
- Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension
- of the input tensors. (See `input_ids` above)
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
- num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
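- # flatten the choices dimension into the batch dimension so every choice is encoded in one forward pass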
-
- input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
- attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
- token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
- position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
- inputs_embeds = (
- inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
- if inputs_embeds is not None
- else None
- )
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
-
- pooled_output = self.sequence_summary(sequence_output)
- logits = self.classifier(pooled_output)
- reshaped_logits = logits.view(-1, num_choices)
-
- loss = None
- if labels is not None:
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(reshaped_logits, labels)
-
- if not return_dict:
- output = (reshaped_logits,) + outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return MultipleChoiceModelOutput(
- loss=loss,
- logits=reshaped_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of
- the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForTokenClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
- self.num_labels = config.num_labels
-
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
- self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=TokenClassifierOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- labels=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the token classification loss.
- Indices should be in `[0, ..., config.num_labels - 1]`.
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
-
- sequence_output = self.dropout(sequence_output)
- logits = self.classifier(sequence_output)
-
- loss = None
- if labels is not None:
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-
- if not return_dict:
- output = (logits,) + outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return TokenClassifierOutput(
- loss=loss,
- logits=logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
- layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
-
- config.num_labels = 2
- self.num_labels = config.num_labels
-
- self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
- self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=QuestionAnsweringModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- position_ids=None,
- head_mask=None,
- inputs_embeds=None,
- start_positions=None,
- end_positions=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
-        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for the position (index) of the start of the labelled span, used for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Positions outside of the sequence are not taken into account for computing the loss.
-        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for the position (index) of the end of the labelled span, used for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Positions outside of the sequence are not taken into account for computing the loss.
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- outputs = self.{{cookiecutter.lowercase_modelname}}(
- input_ids,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- position_ids=position_ids,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
-
- logits = self.qa_outputs(sequence_output)
- start_logits, end_logits = logits.split(1, dim=-1)
- start_logits = start_logits.squeeze(-1)
- end_logits = end_logits.squeeze(-1)
-
- total_loss = None
- if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, splitting adds an extra dimension; squeeze it
- if len(start_positions.size()) > 1:
- start_positions = start_positions.squeeze(-1)
- if len(end_positions.size()) > 1:
- end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs; we ignore these terms
- ignored_index = start_logits.size(1)
- start_positions = start_positions.clamp(0, ignored_index)
- end_positions = end_positions.clamp(0, ignored_index)
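-            # e.g. with a sequence length of 384, a start_position of 500 is clamped
-            # to 384, which equals `ignored_index`, so that example contributes no loss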
-
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
- start_loss = loss_fct(start_logits, start_positions)
- end_loss = loss_fct(end_logits, end_positions)
- total_loss = (start_loss + end_loss) / 2
-
- if not return_dict:
- output = (start_logits, end_logits) + outputs[1:]
- return ((total_loss,) + output) if total_loss is not None else output
-
- return QuestionAnsweringModelOutput(
- loss=total_loss,
- start_logits=start_logits,
- end_logits=end_logits,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- )
-{% else %}
-import math
-import copy
-from typing import Optional, Tuple, List, Union
-
-import torch
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
-from ...activations import ACT2FN
-from ...utils import (
- add_code_sample_docstrings,
- add_end_docstrings,
- add_start_docstrings,
- add_start_docstrings_to_model_forward,
- replace_return_docstrings,
-)
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
-from ...modeling_outputs import (
- BaseModelOutput,
- BaseModelOutputWithPastAndCrossAttentions,
- Seq2SeqLMOutput,
- Seq2SeqModelOutput,
- Seq2SeqQuestionAnsweringModelOutput,
- Seq2SeqSequenceClassifierOutput,
-    CausalLMOutputWithCrossAttentions,
-)
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
-from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "{{cookiecutter.checkpoint_identifier}}"
-_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
-
-
-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
- """
- Shift input ids one token to the right.
- """
- shifted_input_ids = input_ids.new_zeros(input_ids.shape)
- shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
- shifted_input_ids[:, 0] = decoder_start_token_id
-
- assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
- # replace possible -100 values in labels by `pad_token_id`
- shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
-
- return shifted_input_ids
-
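-# Illustrative example (a sketch, not part of the generated file): with
-# pad_token_id=0 and decoder_start_token_id=2, labels [[5, -100, -100]] become
-# [[2, 5, 0]] -- the start token is prepended, tokens shift one position to the
-# right, and any remaining -100 markers are replaced by the pad token id.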
-
-class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embedding):
- """
- This module learns positional embeddings up to a fixed maximum size.
- """
-
- def __init__(self, num_embeddings: int, embedding_dim: int):
- super().__init__(num_embeddings, embedding_dim)
-
- def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
- """`input_ids_shape` is expected to be [bsz x seqlen]."""
- bsz, seq_len = input_ids_shape[:2]
- positions = torch.arange(
- past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
- )
- return super().forward(positions)
-
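-# Illustrative note: during incremental decoding with past_key_values_length=3
-# and a new chunk of seq_len=2, the positions computed above are [3, 4], so each
-# newly generated token receives the next learned position embedding.
-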
-
-class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
- """Multi-headed attention from 'Attention Is All You Need' paper"""
-
- def __init__(
- self,
- embed_dim: int,
- num_heads: int,
- dropout: float = 0.0,
- is_decoder: bool = False,
- bias: bool = True,
- ):
- super().__init__()
- self.embed_dim = embed_dim
- self.num_heads = num_heads
- self.dropout = dropout
- self.head_dim = embed_dim // num_heads
- assert (
- self.head_dim * num_heads == self.embed_dim
- ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
- self.scaling = self.head_dim ** -0.5
- self.is_decoder = is_decoder
-
- self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
- self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
- self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
- self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
- def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- key_value_states: Optional[torch.Tensor] = None,
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
- attention_mask: Optional[torch.Tensor] = None,
- layer_head_mask: Optional[torch.Tensor] = None,
- output_attentions: bool = False,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- """Input shape: Batch x Time x Channel"""
-
- # if key_value_states are provided this layer is used as a cross-attention layer
- # for the decoder
- is_cross_attention = key_value_states is not None
- bsz, tgt_len, embed_dim = hidden_states.size()
-
- # get query proj
- query_states = self.q_proj(hidden_states) * self.scaling
- # get key, value proj
- if is_cross_attention and past_key_value is not None:
- # reuse k,v, cross_attentions
- key_states = past_key_value[0]
- value_states = past_key_value[1]
- elif is_cross_attention:
- # cross_attentions
- key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
- value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
- elif past_key_value is not None:
- # reuse k, v, self_attention
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
- key_states = torch.cat([past_key_value[0], key_states], dim=2)
- value_states = torch.cat([past_key_value[1], value_states], dim=2)
- else:
- # self_attention
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
- if self.is_decoder:
- # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
- # Further calls to cross_attention layer can then reuse all cross-attention
- # key/value_states (first "if" case)
- # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
- # if encoder bi-directional self-attention `past_key_value` is always `None`
- past_key_value = (key_states, value_states)
-
- proj_shape = (bsz * self.num_heads, -1, self.head_dim)
- query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
- key_states = key_states.view(*proj_shape)
- value_states = value_states.view(*proj_shape)
-
- src_len = key_states.size(1)
- attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-
- if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
- raise ValueError(
- f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
- )
-
- if attention_mask is not None:
- if attention_mask.size() != (bsz, 1, tgt_len, src_len):
- raise ValueError(
- f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
- )
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
- attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
- if layer_head_mask is not None:
- if layer_head_mask.size() != (self.num_heads,):
- raise ValueError(
- f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
- )
- attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
- if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following computation
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
- else:
- attn_weights_reshaped = None
-
- attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-
- attn_output = torch.bmm(attn_probs, value_states)
-
- if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
- raise ValueError(
- f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
- )
-
- attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
- attn_output = attn_output.transpose(1, 2)
- attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
-
- attn_output = self.out_proj(attn_output)
-
- return attn_output, attn_weights_reshaped, past_key_value
-
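-# Shape walk-through for the attention above (illustrative, assuming bsz=2,
-# num_heads=4, head_dim=8, tgt_len=5, src_len=7): query/key/value states are
-# flattened to (bsz * num_heads, -1, head_dim), i.e. (8, 5, 8) and (8, 7, 8);
-# the first bmm yields attn_weights of shape (8, 5, 7), the second yields
-# (8, 5, 8), which is merged back to (bsz, tgt_len, embed_dim) = (2, 5, 32).
-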
-
-class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config):
- super().__init__()
- self.embed_dim = config.d_model
- self.self_attn = {{cookiecutter.camelcase_modelname}}Attention(
- embed_dim=self.embed_dim,
- num_heads=config.encoder_attention_heads,
- dropout=config.attention_dropout,
- )
- self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
- self.dropout = config.dropout
- self.activation_fn = ACT2FN[config.activation_function]
- self.activation_dropout = config.activation_dropout
- self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
- self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
- self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: torch.Tensor,
- layer_head_mask: torch.Tensor,
- output_attentions: bool = False,
- ):
- """
- Args:
- hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
- attention_mask (`torch.FloatTensor`): attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
- layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
- *(config.encoder_attention_heads,)*.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- """
- residual = hidden_states
- hidden_states, attn_weights, _ = self.self_attn(
- hidden_states=hidden_states,
- attention_mask=attention_mask,
- layer_head_mask=layer_head_mask,
- output_attentions=output_attentions,
- )
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
- hidden_states = self.fc2(hidden_states)
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
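-        # Guard against fp16 overflow: torch.finfo(torch.float16).max is 65504,
-        # so activations that became inf/nan are clamped to roughly +/-64504.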
- if hidden_states.dtype == torch.float16 and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
- clamp_value = torch.finfo(hidden_states.dtype).max - 1000
- hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (attn_weights,)
-
- return outputs
-
-
-class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config):
- super().__init__()
- self.embed_dim = config.d_model
-
- self.self_attn = {{cookiecutter.camelcase_modelname}}Attention(
- embed_dim=self.embed_dim,
- num_heads=config.decoder_attention_heads,
- dropout=config.attention_dropout,
- is_decoder=True,
- )
- self.dropout = config.dropout
- self.activation_fn = ACT2FN[config.activation_function]
- self.activation_dropout = config.activation_dropout
-
- self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
- self.encoder_attn = {{cookiecutter.camelcase_modelname}}Attention(
- self.embed_dim,
- config.decoder_attention_heads,
- dropout=config.attention_dropout,
- is_decoder=True,
- )
- self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
- self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
- self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
- self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- encoder_hidden_states: Optional[torch.Tensor] = None,
- encoder_attention_mask: Optional[torch.Tensor] = None,
- layer_head_mask: Optional[torch.Tensor] = None,
- cross_layer_head_mask: Optional[torch.Tensor] = None,
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
- output_attentions: Optional[bool] = False,
- use_cache: Optional[bool] = True,
- ):
- """
- Args:
- hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
- attention_mask (`torch.FloatTensor`): attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
- encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
- encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
- *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                *(decoder_attention_heads,)*.
- cross_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size *(decoder_attention_heads,)*.
- past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- """
- residual = hidden_states
-
- # Self Attention
- # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
- # add present self-attn cache to positions 1,2 of present_key_value tuple
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
- hidden_states=hidden_states,
- past_key_value=self_attn_past_key_value,
- attention_mask=attention_mask,
- layer_head_mask=layer_head_mask,
- output_attentions=output_attentions,
- )
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
- hidden_states = residual + hidden_states
- hidden_states = self.self_attn_layer_norm(hidden_states)
-
- # Cross-Attention Block
- cross_attn_present_key_value = None
- cross_attn_weights = None
- if encoder_hidden_states is not None:
- residual = hidden_states
-
- # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
- hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
- hidden_states=hidden_states,
- key_value_states=encoder_hidden_states,
- attention_mask=encoder_attention_mask,
- layer_head_mask=cross_layer_head_mask,
- past_key_value=cross_attn_past_key_value,
- output_attentions=output_attentions,
- )
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
- hidden_states = residual + hidden_states
- hidden_states = self.encoder_attn_layer_norm(hidden_states)
-
- # add cross-attn to positions 3,4 of present_key_value tuple
- present_key_value = present_key_value + cross_attn_present_key_value
-
- # Fully Connected
- residual = hidden_states
- hidden_states = self.activation_fn(self.fc1(hidden_states))
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
- hidden_states = self.fc2(hidden_states)
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
- hidden_states = residual + hidden_states
- hidden_states = self.final_layer_norm(hidden_states)
-
- outputs = (hidden_states,)
-
- if output_attentions:
- outputs += (self_attn_weights, cross_attn_weights)
-
- if use_cache:
- outputs += (present_key_value,)
-
- return outputs
-
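-# Cache layout note (illustrative): for a decoder layer with cross-attention,
-# `present_key_value` is a 4-tuple (self_attn_key, self_attn_value,
-# cross_attn_key, cross_attn_value). The self-attention entries grow by one
-# position per generation step, while the cross-attention entries are computed
-# once from the encoder output and reused on every step.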
-
-# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}ClassificationHead(nn.Module):
- """Head for sentence-level classification tasks."""
-
- def __init__(
- self,
- input_dim: int,
- inner_dim: int,
- num_classes: int,
- pooler_dropout: float,
- ):
- super().__init__()
- self.dense = nn.Linear(input_dim, inner_dim)
- self.dropout = nn.Dropout(p=pooler_dropout)
- self.out_proj = nn.Linear(inner_dim, num_classes)
-
- def forward(self, hidden_states: torch.Tensor):
- hidden_states = self.dropout(hidden_states)
- hidden_states = self.dense(hidden_states)
- hidden_states = torch.tanh(hidden_states)
- hidden_states = self.dropout(hidden_states)
- hidden_states = self.out_proj(hidden_states)
- return hidden_states
-
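-# Usage note (illustrative): in the sequence classification model further below,
-# this head is typically applied to the decoder hidden state at the final <eos>
-# position, so its input is (batch_size, input_dim) and its output
-# (batch_size, num_classes).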
-
-class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
- config_class = {{cookiecutter.camelcase_modelname}}Config
- base_model_prefix = "model"
- supports_gradient_checkpointing = True
-
- def _init_weights(self, module):
- std = self.config.init_std
- if isinstance(module, nn.Linear):
- module.weight.data.normal_(mean=0.0, std=std)
- if module.bias is not None:
- module.bias.data.zero_()
- elif isinstance(module, nn.Embedding):
- module.weight.data.normal_(mean=0.0, std=std)
- if module.padding_idx is not None:
- module.weight.data[module.padding_idx].zero_()
-
-
-{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
-    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
- pruning heads etc.)
-
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
- subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
- general usage and behavior.
-
- Parameters:
- config ([`~{{cookiecutter.camelcase_modelname}}Config`]):
- Model configuration class with all the parameters of the model.
- Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
- weights.
-"""
-
-{{cookiecutter.uppercase_modelname}}_GENERATION_EXAMPLE = r"""
- Summarization example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
- >>> model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
-
- >>> # Generate Summary
- >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5)
-    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
- ```
-"""
-
-{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
- it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
- Provide for translation and summarization training. By default, the model will create this tensor by
- shifting the `input_ids` to the right, following the paper.
- decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
- Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
- also be used by default.
-
-            If you want to change padding behavior, you should read [`modeling_attn_mask_utils._prepare_4d_causal_attention_mask`]
-            (used in this model's decoder) and modify it to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
- information on the default strategy.
- head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
-            Tuple consisting of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states
-            at the output of the last layer of the encoder, used in the cross-attention of the decoder.
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
-            of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
-            shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
- than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
- representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
- have to be input (see `past_key_values`). This is useful if you want more control over how to convert
- `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-
- If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
- takes the value of `inputs_embeds`.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
- decoding (see `past_key_values`).
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
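-# Illustrative sketch of `past_key_values` reuse during generation (assumes a
-# `model` of the conditional-generation class defined below; not part of the
-# template itself):
-#
-#     out = model(input_ids=src_ids, decoder_input_ids=tgt_ids, use_cache=True)
-#     next_token = out.logits[:, -1:].argmax(-1)
-#     out = model(
-#         encoder_outputs=(out.encoder_last_hidden_state,),
-#         decoder_input_ids=next_token,          # only the newest token
-#         past_key_values=out.past_key_values,   # cached keys/values
-#         use_cache=True,
-#     )
-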
-
-{{cookiecutter.uppercase_modelname}}_STANDALONE_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
- it.
-
-            Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
- details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- """
- Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- [`{{cookiecutter.camelcase_modelname}}EncoderLayer`].
-
- Args:
- config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (nn.Embedding): input token embeddings (optionally shared with the decoder)
- """
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
- super().__init__(config)
-
- self.dropout = config.dropout
- self.layerdrop = config.encoder_layerdrop
-
- embed_dim = config.d_model
- self.padding_idx = config.pad_token_id
- self.max_source_positions = config.max_position_embeddings
- self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
-
- if embed_tokens is not None:
- self.embed_tokens = embed_tokens
- else:
- self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
-
- self.embed_positions = {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(
- config.max_position_embeddings,
- embed_dim,
- )
- self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}EncoderLayer(config) for _ in range(config.encoder_layers)])
- self.layernorm_embedding = nn.LayerNorm(embed_dim)
-
- self.gradient_checkpointing = False
- # Initialize weights and apply final processing
- self.post_init()
-
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- head_mask=None,
- inputs_embeds=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert `input_ids` indices
- into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- """
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- # retrieve input_ids and inputs_embeds
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
- elif input_ids is not None:
- self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
- input_shape = input_ids.size()
- input_ids = input_ids.view(-1, input_shape[-1])
- elif inputs_embeds is not None:
- input_shape = inputs_embeds.size()[:-1]
- else:
- raise ValueError("You have to specify either input_ids or inputs_embeds")
-
- if inputs_embeds is None:
- inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
- embed_pos = self.embed_positions(input_shape)
-
- hidden_states = inputs_embeds + embed_pos
- hidden_states = self.layernorm_embedding(hidden_states)
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
- # expand attention_mask
- if attention_mask is not None:
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
-
- encoder_states = () if output_hidden_states else None
- all_attentions = () if output_attentions else None
-
- # check if head_mask has a correct number of layers specified if desired
- if head_mask is not None:
- assert head_mask.size()[0] == (
- len(self.layers)
- ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
-
- for idx, encoder_layer in enumerate(self.layers):
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states,)
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])  # uniform in [0, 1), as LayerDrop expects
- if self.training and (dropout_probability < self.layerdrop): # skip the layer
- layer_outputs = (None, None)
- else:
- if self.gradient_checkpointing and self.training:
- layer_outputs = self._gradient_checkpointing_func(
- encoder_layer.__call__,
- hidden_states,
- attention_mask,
- (head_mask[idx] if head_mask is not None else None),
- output_attentions,
- )
- else:
- layer_outputs = encoder_layer(
- hidden_states,
- attention_mask,
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
- output_attentions=output_attentions,
- )
-
- hidden_states = layer_outputs[0]
-
- if output_attentions:
- all_attentions = all_attentions + (layer_outputs[1],)
-
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states,)
-
- if not return_dict:
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
- return BaseModelOutput(
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
- )
-
-
-class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- """
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`{{cookiecutter.camelcase_modelname}}DecoderLayer`]
-
- Args:
- config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (nn.Embedding): input token embeddings (optionally shared with the encoder)
- """
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
- super().__init__(config)
- self.dropout = config.dropout
- self.layerdrop = config.decoder_layerdrop
- self.padding_idx = config.pad_token_id
- self.max_target_positions = config.max_position_embeddings
- self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
-
- if embed_tokens is not None:
- self.embed_tokens = embed_tokens
- else:
- self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
-
- self.embed_positions = {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(
- config.max_position_embeddings,
- config.d_model,
- )
- self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}DecoderLayer(config) for _ in range(config.decoder_layers)])
- self.layernorm_embedding = nn.LayerNorm(config.d_model)
-
- self.gradient_checkpointing = False
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self):
- return self.embed_tokens
-
- def set_input_embeddings(self, value):
- self.embed_tokens = value
-
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- head_mask=None,
- cross_attn_head_mask=None,
- past_key_values=None,
- inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
- of the decoder.
- encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
- Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
- tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
- decoding.
-
- If `past_key_values` are used, the user can optionally input only the last
- `decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
- sequence_length)`.
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
- than the model's internal embedding lookup matrix.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- """
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- # retrieve input_ids and inputs_embeds
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
- elif input_ids is not None:
- input_shape = input_ids.size()
- input_ids = input_ids.view(-1, input_shape[-1])
- elif inputs_embeds is not None:
- input_shape = inputs_embeds.size()[:-1]
- else:
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
- # past_key_values_length
- past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-
- if inputs_embeds is None:
- inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
- attention_mask = _prepare_4d_causal_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length)
-
- # expand encoder attention mask
- if encoder_hidden_states is not None and encoder_attention_mask is not None:
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- encoder_attention_mask = _prepare_4d_attention_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-
- # embed positions
- positions = self.embed_positions(input_shape, past_key_values_length)
-
- hidden_states = inputs_embeds + positions
- hidden_states = self.layernorm_embedding(hidden_states)
-
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
- # decoder layers
- if self.gradient_checkpointing and self.training and use_cache:
- logger.warning("`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...")
- use_cache = False
-
- all_hidden_states = () if output_hidden_states else None
- all_self_attns = () if output_attentions else None
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
- next_decoder_cache = () if use_cache else None
-
- # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
- for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
- if attn_mask is not None:
- assert attn_mask.size()[0] == (
- len(self.layers)
- ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
- for idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])  # uniform in [0, 1), as LayerDrop expects
-            if self.training and (dropout_probability < self.layerdrop):
- continue
-
- past_key_value = past_key_values[idx] if past_key_values is not None else None
-
- if self.gradient_checkpointing and self.training:
- layer_outputs = self._gradient_checkpointing_func(
- decoder_layer.__call__,
- hidden_states,
- attention_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- head_mask[idx] if head_mask is not None else None,
- cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
- None,
- output_attentions,
- use_cache,
- )
- else:
- layer_outputs = decoder_layer(
- hidden_states,
- attention_mask=attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
- cross_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
- past_key_value=past_key_value,
- output_attentions=output_attentions,
- use_cache=use_cache,
- )
- hidden_states = layer_outputs[0]
-
- if use_cache:
- next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
-
- if output_attentions:
- all_self_attns += (layer_outputs[1],)
-
- if encoder_hidden_states is not None:
- all_cross_attentions += (layer_outputs[2],)
-
- # add hidden states from the last decoder layer
- if output_hidden_states:
- all_hidden_states += (hidden_states,)
-
- next_cache = next_decoder_cache if use_cache else None
- if not return_dict:
- return tuple(
- v
- for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
- if v is not None
- )
- return BaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- past_key_values=next_cache,
- hidden_states=all_hidden_states,
- attentions=all_self_attns,
- cross_attentions=all_cross_attentions,
- )
-
-
-@add_start_docstrings(
- "The bare {{cookiecutter.modelname}} Model outputting raw hidden-states without any specific head on top.",
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config):
- super().__init__(config)
-
- padding_idx, vocab_size = config.pad_token_id, config.vocab_size
- self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
-
- self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config, self.shared)
- self.decoder = {{cookiecutter.camelcase_modelname}}Decoder(config, self.shared)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self):
- return self.shared
-
- def set_input_embeddings(self, value):
- self.shared = value
- self.encoder.embed_tokens = self.shared
- self.decoder.embed_tokens = self.shared
-
- def get_encoder(self):
- return self.encoder
-
- def get_decoder(self):
- return self.decoder
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=Seq2SeqModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- encoder_outputs=None,
- past_key_values=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- if encoder_outputs is None:
- encoder_outputs = self.encoder(
- input_ids=input_ids,
- attention_mask=attention_mask,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
- encoder_outputs = BaseModelOutput(
- last_hidden_state=encoder_outputs[0],
- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
- )
-
- # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
- decoder_outputs = self.decoder(
- input_ids=decoder_input_ids,
- attention_mask=decoder_attention_mask,
- encoder_hidden_states=encoder_outputs[0],
- encoder_attention_mask=attention_mask,
- head_mask=decoder_head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- past_key_values=past_key_values,
- inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- if not return_dict:
- return decoder_outputs + encoder_outputs
-
- return Seq2SeqModelOutput(
- last_hidden_state=decoder_outputs.last_hidden_state,
- past_key_values=decoder_outputs.past_key_values,
- decoder_hidden_states=decoder_outputs.hidden_states,
- decoder_attentions=decoder_outputs.attentions,
- cross_attentions=decoder_outputs.cross_attentions,
- encoder_last_hidden_state=encoder_outputs.last_hidden_state,
- encoder_hidden_states=encoder_outputs.hidden_states,
- encoder_attentions=encoder_outputs.attentions,
- )
-
-
-@add_start_docstrings(
- "The {{cookiecutter.modelname}} Model with a language modeling head. Can be used for summarization.", {{cookiecutter.uppercase_modelname}}_START_DOCSTRING
-)
-class {{cookiecutter.camelcase_modelname}}ForConditionalGeneration({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- base_model_prefix = "model"
- _keys_to_ignore_on_load_missing = [
- r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
- ]
-
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config):
- super().__init__(config)
- self.model = {{cookiecutter.camelcase_modelname}}Model(config)
- self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
- self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_encoder(self):
- return self.model.get_encoder()
-
- def get_decoder(self):
- return self.model.get_decoder()
-
- def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
- new_embeddings = super().resize_token_embeddings(new_num_tokens)
- self._resize_final_logits_bias(new_num_tokens)
- return new_embeddings
-
- def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
- old_num_tokens = self.final_logits_bias.shape[-1]
- if new_num_tokens <= old_num_tokens:
- new_bias = self.final_logits_bias[:, :new_num_tokens]
- else:
- extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
- new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
- self.register_buffer("final_logits_bias", new_bias)
-
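-    # Illustrative example: growing the vocabulary from 50265 to 50270 appends
-    # five zero entries to `final_logits_bias`; shrinking simply truncates it,
-    # keeping the buffer aligned with the resized `lm_head`.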
- def get_output_embeddings(self):
- return self.lm_head
-
- def set_output_embeddings(self, new_embeddings):
- self.lm_head = new_embeddings
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
- @add_end_docstrings({{cookiecutter.uppercase_modelname}}_GENERATION_EXAMPLE)
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- encoder_outputs=None,
- past_key_values=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- labels=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
-            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the loss is only
-            computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
- Returns:
-
- Conditional generation example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> TXT = "My friends are but they eat too many carbs."
-
- >>> model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
- >>> logits = model(input_ids).logits
-
- >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
- >>> probs = logits[0, masked_index].softmax(dim=0)
- >>> values, predictions = probs.topk(5)
-
- >>> tokenizer.decode(predictions).split()
- ```
-"""
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- if labels is not None:
- if use_cache:
- logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
- use_cache = False
- if decoder_input_ids is None and decoder_inputs_embeds is None:
- decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
-
- outputs = self.model(
- input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- encoder_outputs=encoder_outputs,
- decoder_attention_mask=decoder_attention_mask,
- head_mask=head_mask,
- decoder_head_mask=decoder_head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- decoder_inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
-
- masked_lm_loss = None
- if labels is not None:
- loss_fct = CrossEntropyLoss()
- masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
-
- if not return_dict:
- output = (lm_logits,) + outputs[1:]
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
- return Seq2SeqLMOutput(
- loss=masked_lm_loss,
- logits=lm_logits,
- past_key_values=outputs.past_key_values,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
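The `labels`-to-`decoder_input_ids` fallback above relies on `shift_tokens_right`. A toy sketch of its effect (ignoring the detail that `-100` labels are also replaced by the pad token): the decoder reads the label sequence shifted one position right, prefixed with the start token.

```python
import torch

labels = torch.tensor([[10, 11, 12]])
decoder_start_token_id = 2  # illustrative value

# Shift right: prepend the start token, drop the last label position.
shifted = torch.cat(
    [torch.full((labels.shape[0], 1), decoder_start_token_id), labels[:, :-1]], dim=-1
)
assert shifted.tolist() == [[2, 10, 11]]
```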
- def prepare_inputs_for_generation(
- self,
- decoder_input_ids,
- past_key_values=None,
- attention_mask=None,
- head_mask=None,
- decoder_head_mask=None,
- cross_attn_head_mask=None,
- use_cache=None,
- encoder_outputs=None,
- **kwargs
- ):
- # cut decoder_input_ids if past is used
- if past_key_values is not None:
- decoder_input_ids = decoder_input_ids[:, -1:]
-
- return {
- "input_ids": None, # encoder_outputs is defined. input_ids not needed
- "encoder_outputs": encoder_outputs,
- "past_key_values": past_key_values,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "head_mask": head_mask,
- "decoder_head_mask": decoder_head_mask,
- "cross_attn_head_mask": cross_attn_head_mask,
- "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
- }
-
- @staticmethod
- def _reorder_cache(past_key_values, beam_idx):
- reordered_past = ()
- for layer_past in past_key_values:
- reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),)
- return reordered_past
-
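`_reorder_cache` exists because beam search permutes the batch dimension between steps, so the cached key/value tensors must follow the surviving beams. A self-contained sketch with toy tensors (shapes are illustrative):

```python
import torch

batch, heads, seq, dim = 4, 2, 3, 8
layer_past = tuple(torch.randn(batch, heads, seq, dim) for _ in range(2))
past_key_values = (layer_past,)  # one decoder layer

# After a beam step, beam_idx[i] names the old row that beam i continues from.
beam_idx = torch.tensor([2, 0, 0, 3])
reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer_past)
    for layer_past in past_key_values
)
assert torch.equal(reordered[0][0][1], past_key_values[0][0][0])  # beam 1 now carries old row 0
```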
-
-@add_start_docstrings(
- """
- {{cookiecutter.camelcase_modelname}} model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE
- tasks.
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
- super().__init__(config, **kwargs)
- self.model = {{cookiecutter.camelcase_modelname}}Model(config)
- self.classification_head = {{cookiecutter.camelcase_modelname}}ClassificationHead(
- config.d_model,
- config.d_model,
- config.num_labels,
- config.classifier_dropout,
- )
- self.model._init_weights(self.classification_head.dense)
- self.model._init_weights(self.classification_head.out_proj)
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=Seq2SeqSequenceClassifierOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- encoder_outputs=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- labels=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
- if labels is not None:
- use_cache = False
-
- if input_ids is None and inputs_embeds is not None:
- raise NotImplementedError(
- f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
- )
-
- outputs = self.model(
- input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- encoder_outputs=encoder_outputs,
- inputs_embeds=inputs_embeds,
- decoder_inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- hidden_states = outputs[0] # last hidden state
-
- eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
-
- if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
- raise ValueError("All examples must have the same number of tokens.")
- sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
- :, -1, :
- ]
- logits = self.classification_head(sentence_representation)
-
- loss = None
- if labels is not None:
- if self.config.problem_type is None:
- if self.config.num_labels == 1:
- self.config.problem_type = "regression"
- elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
- self.config.problem_type = "single_label_classification"
- else:
- self.config.problem_type = "multi_label_classification"
-
- if self.config.problem_type == "regression":
- loss_fct = MSELoss()
- if self.config.num_labels == 1:
- loss = loss_fct(logits.squeeze(), labels.squeeze())
- else:
- loss = loss_fct(logits, labels)
- elif self.config.problem_type == "single_label_classification":
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
- elif self.config.problem_type == "multi_label_classification":
- loss_fct = BCEWithLogitsLoss()
- loss = loss_fct(logits, labels)
- if not return_dict:
- output = (logits,) + outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return Seq2SeqSequenceClassifierOutput(
- loss=loss,
- logits=logits,
- past_key_values=outputs.past_key_values,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
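The sentence representation above is the hidden state at each sequence's *last* EOS token, which is why the forward pass insists on an equal number of EOS tokens per example. A runnable sketch of that pooling (toy shapes, `eos_token_id` assumed to be 2):

```python
import torch

eos_token_id = 2
input_ids = torch.tensor([[5, 6, 2, 7, 2], [8, 2, 9, 10, 2]])  # two EOS per row
hidden_states = torch.randn(2, 5, 16)

eos_mask = input_ids.eq(eos_token_id)
# Gather all EOS positions, reshape to (batch, n_eos, hidden), keep the last one.
sentence_representation = hidden_states[eos_mask, :].view(2, -1, 16)[:, -1, :]
assert torch.equal(sentence_representation[0], hidden_states[0, 4])
```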
-
-@add_start_docstrings(
- """
- {{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
- layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
- """,
- {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
-)
-class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- super().__init__(config)
-
- config.num_labels = 2
- self.num_labels = config.num_labels
-
- self.model = {{cookiecutter.camelcase_modelname}}Model(config)
- self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
- self.model._init_weights(self.qa_outputs)
-
- @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(
- checkpoint=_CHECKPOINT_FOR_DOC,
- output_type=Seq2SeqQuestionAnsweringModelOutput,
- config_class=_CONFIG_FOR_DOC,
- )
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- decoder_input_ids=None,
- decoder_attention_mask=None,
- encoder_outputs=None,
- start_positions=None,
- end_positions=None,
- inputs_embeds=None,
- decoder_inputs_embeds=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the
- sequence are not taken into account for computing the loss.
- end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the
- sequence are not taken into account for computing the loss.
- """
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
- if start_positions is not None and end_positions is not None:
- use_cache = False
-
- outputs = self.model(
- input_ids,
- attention_mask=attention_mask,
- decoder_input_ids=decoder_input_ids,
- decoder_attention_mask=decoder_attention_mask,
- encoder_outputs=encoder_outputs,
- inputs_embeds=inputs_embeds,
- decoder_inputs_embeds=decoder_inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- sequence_output = outputs[0]
-
- logits = self.qa_outputs(sequence_output)
- start_logits, end_logits = logits.split(1, dim=-1)
- start_logits = start_logits.squeeze(-1)
- end_logits = end_logits.squeeze(-1)
-
- total_loss = None
- if start_positions is not None and end_positions is not None:
- # If we are on multi-GPU, the positions can gain an extra dimension; squeeze it away
- if len(start_positions.size()) > 1:
- start_positions = start_positions.squeeze(-1)
- if len(end_positions.size()) > 1:
- end_positions = end_positions.squeeze(-1)
- # sometimes the start/end positions are outside our model inputs; we ignore these terms
- ignored_index = start_logits.size(1)
- start_positions = start_positions.clamp(0, ignored_index)
- end_positions = end_positions.clamp(0, ignored_index)
-
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
- start_loss = loss_fct(start_logits, start_positions)
- end_loss = loss_fct(end_logits, end_positions)
- total_loss = (start_loss + end_loss) / 2
-
- if not return_dict:
- output = (
- start_logits,
- end_logits,
- ) + outputs[1:]
- return ((total_loss,) + output) if total_loss is not None else output
-
- return Seq2SeqQuestionAnsweringModelOutput(
- loss=total_loss,
- start_logits=start_logits,
- end_logits=end_logits,
- past_key_values=outputs.past_key_values,
- decoder_hidden_states=outputs.decoder_hidden_states,
- decoder_attentions=outputs.decoder_attentions,
- cross_attentions=outputs.cross_attentions,
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
- encoder_hidden_states=outputs.encoder_hidden_states,
- encoder_attentions=outputs.encoder_attentions,
- )
-
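The clamping in the QA head above is what keeps labels that fall outside the model's input from crashing the loss: they are pushed onto an index that `CrossEntropyLoss` is told to ignore. A small sketch with made-up numbers:

```python
import torch
from torch.nn import CrossEntropyLoss

start_logits = torch.randn(2, 6)         # (batch, seq_len)
start_positions = torch.tensor([3, 9])   # 9 lies outside the 6-token window

ignored_index = start_logits.size(1)     # 6: one past the last valid class
start_positions = start_positions.clamp(0, ignored_index)  # -> [3, 6]

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
loss = loss_fct(start_logits, start_positions)  # only example 0 contributes
```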
-# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}DecoderWrapper({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- """
- This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the [`EncoderDecoderModel`] framework.
- """
-
- def __init__(self, config):
- super().__init__(config)
- self.decoder = {{cookiecutter.camelcase_modelname}}Decoder(config)
-
- def forward(self, *args, **kwargs):
- return self.decoder(*args, **kwargs)
-
-
-# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->{{cookiecutter.camelcase_modelname}}
-class {{cookiecutter.camelcase_modelname}}ForCausalLM({{cookiecutter.camelcase_modelname}}PreTrainedModel):
- def __init__(self, config):
- config = copy.deepcopy(config)
- config.is_decoder = True
- config.is_encoder_decoder = False
- super().__init__(config)
- self.model = {{cookiecutter.camelcase_modelname}}DecoderWrapper(config)
-
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self):
- return self.model.decoder.embed_tokens
-
- def set_input_embeddings(self, value):
- self.model.decoder.embed_tokens = value
-
- def get_output_embeddings(self):
- return self.lm_head
-
- def set_output_embeddings(self, new_embeddings):
- self.lm_head = new_embeddings
-
- def set_decoder(self, decoder):
- self.model.decoder = decoder
-
- def get_decoder(self):
- return self.model.decoder
-
- @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
- def forward(
- self,
- input_ids=None,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- head_mask=None,
- cross_attn_head_mask=None,
- past_key_values=None,
- inputs_embeds=None,
- labels=None,
- use_cache=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None,
- ):
- r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it.
-
- Indices can be obtained using [`~{{cookiecutter.camelcase_modelname}}Tokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details.
-
- [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- [What are attention masks?](../glossary#attention-mask)
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
- if the model is configured as a decoder.
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
-
- head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
- - 1 indicates the head is **not masked**,
- - 0 indicates the head is **masked**.
-
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
- Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
- decoding.
-
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
- instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
- ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
- use_cache (`bool`, *optional*):
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up
- decoding (see `past_key_values`).
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-
- Returns:
-
- Example:
-
- ```python
- >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForCausalLM
-
- >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('facebook/bart-large')
- >>> model = {{cookiecutter.camelcase_modelname}}ForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
-
- >>> logits = outputs.logits
- ```
-"""
-
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
- outputs = self.model.decoder(
- input_ids=input_ids,
- attention_mask=attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- head_mask=head_mask,
- cross_attn_head_mask=cross_attn_head_mask,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- logits = self.lm_head(outputs[0])
-
- loss = None
- if labels is not None:
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
-
- if not return_dict:
- output = (logits,) + outputs[1:]
- return (loss,) + output if loss is not None else output
-
- return CausalLMOutputWithCrossAttentions(
- loss=loss,
- logits=logits,
- past_key_values=outputs.past_key_values,
- hidden_states=outputs.hidden_states,
- attentions=outputs.attentions,
- cross_attentions=outputs.cross_attentions,
- )
-
- def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs):
- # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
- if attention_mask is None:
- attention_mask = input_ids.new_ones(input_ids.shape)
-
- if past_key_values:
- input_ids = input_ids[:, -1:]
- # first step, decoder_cached_states are empty
- return {
- "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
- "attention_mask": attention_mask,
- "past_key_values": past_key_values,
- "use_cache": use_cache,
- }
-
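The slicing above encodes the standard incremental-decoding contract: the full prompt goes through once, and after a cache exists only the newest token is fed back. A toy restatement:

```python
import torch

input_ids = torch.tensor([[10, 11, 12]])

past_key_values = None
step_inputs = input_ids if past_key_values is None else input_ids[:, -1:]
assert step_inputs.shape[1] == 3  # first step: whole prompt

past_key_values = ("dummy",)  # stand-in for real key/value tensors
step_inputs = input_ids if past_key_values is None else input_ids[:, -1:]
assert step_inputs.shape[1] == 1  # later steps: last token only
```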
- @staticmethod
- def _reorder_cache(past_key_values, beam_idx):
- reordered_past = ()
- for layer_past in past_key_values:
- reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),)
- return reordered_past
-{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_flax_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_flax_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index a01ab3e19adf58..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_flax_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,669 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-import unittest
-
-from transformers import is_flax_available, {{cookiecutter.camelcase_modelname}}Config
-from transformers.testing_utils import require_flax, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-if is_flax_available():
- import numpy as np
- from transformers import (
- Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
- Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- )
-
-
-class Flax{{cookiecutter.camelcase_modelname}}ModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = {{cookiecutter.camelcase_modelname}}Config(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- initializer_range=self.initializer_range,
- return_dict=True,
- )
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = Flax{{cookiecutter.camelcase_modelname}}Model(config=config)
- inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
- inputs = [input_ids, input_mask]
-
- result = model(*inputs)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_lm_head(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.is_decoder = True
- model = Flax{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- prediction_scores = model(**inputs)["logits"]
- self.parent.assertListEqual(
- list(prediction_scores.shape), [self.batch_size, self.seq_length, self.vocab_size]
- )
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- result = model(**inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
-
- result = model(**inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
- multiple_choice_input_ids = np.tile(np.expand_dims(input_ids, 1), (1, self.num_choices, 1))
- multiple_choice_input_mask = np.tile(np.expand_dims(input_mask, 1), (1, self.num_choices, 1))
- multiple_choice_token_type_ids = np.tile(np.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
- inputs = {
- "input_ids": multiple_choice_input_ids,
- "attention_mask": multiple_choice_input_mask,
- "token_type_ids": multiple_choice_token_type_ids,
- }
- result = model(**inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- result = model(**inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
-
- result = model(**inputs)
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_flax
-class Flax{{cookiecutter.camelcase_modelname}}ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-
- all_model_classes = (
- (
- Flax{{cookiecutter.camelcase_modelname}}Model,
- Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
- Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- )
- if is_flax_available()
- else ()
- )
-
- test_head_masking = False
- test_onnx = False
-
- def setUp(self):
- self.model_tester = Flax{{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_causal_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_lm_head(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model = Flax{{cookiecutter.camelcase_modelname}}Model.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- self.assertIsNotNone(model)
-
-
-def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
- """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
- if a is None and b is None:
- return True
- try:
- if np.allclose(a, b, atol=atol):
- return True
- raise
- except Exception:
- if len(prefix) > 0:
- prefix = f"{prefix}: "
- raise AssertionError(f"{prefix}{a} != {b}")
-
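With the recursive call replaced by `np.allclose` (fixed above), the helper is just a tolerance check with a nicer error message. For reference, the underlying comparison:

```python
import numpy as np

a = np.array([0.1234, 0.5678])
b = a + 5e-5
assert np.allclose(a, b, atol=1e-4)      # within the tolerance used below
assert not np.allclose(a, b, atol=1e-6)  # fails at a tighter tolerance
```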
-
-@require_flax
-class Flax{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_masked_lm(self):
- model = Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- input_ids = np.array([[0, 1, 2, 3, 4, 5]])
- output = model(input_ids)[0]
-
- # TODO Replace vocab size
- vocab_size = 32000
-
- expected_shape = (1, 6, vocab_size)
- self.assertEqual(output.shape, expected_shape)
-
- print(output[:, :3, :3])
-
- # TODO Replace values below with what was printed above.
- expected_slice = np.array(
- [
- [
- [-0.05243197, -0.04498899, 0.05512108],
- [-0.07444685, -0.01064632, 0.04352357],
- [-0.05020351, 0.05530146, 0.00700043],
- ]
- ]
- )
- _assert_tensors_equal(output[:, :3, :3], expected_slice, atol=1e-4)
-
-{% else %}
-import unittest
-
-from transformers import (
- is_flax_available,
- {{cookiecutter.camelcase_modelname}}Config,
- {{cookiecutter.camelcase_modelname}}Tokenizer,
-)
-from transformers.testing_utils import require_sentencepiece, require_flax, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
- import numpy as np
- import jax.numpy as jnp
- from transformers import (
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- )
-
-
-@require_flax
-class Flax{{cookiecutter.camelcase_modelname}}ModelTester:
- config_cls = {{cookiecutter.camelcase_modelname}}Config
- config_updates = {}
- hidden_act = "gelu"
-
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_labels=False,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=20,
- eos_token_id=2,
- pad_token_id=1,
- bos_token_id=0,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
-
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.eos_token_id = eos_token_id
- self.pad_token_id = pad_token_id
- self.bos_token_id = bos_token_id
-
- def prepare_config_and_inputs_for_common(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size).clip(3, self.vocab_size)
- eos_tensor = np.expand_dims(np.array([self.eos_token_id] * self.batch_size), 1)
- input_ids = np.concatenate([input_ids, eos_tensor], axis=1)
-
- decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- config = self.config_cls(
- vocab_size=self.vocab_size,
- d_model=self.hidden_size,
- encoder_layers=self.num_hidden_layers,
- decoder_layers=self.num_hidden_layers,
- encoder_attention_heads=self.num_attention_heads,
- decoder_attention_heads=self.num_attention_heads,
- encoder_ffn_dim=self.intermediate_size,
- decoder_ffn_dim=self.intermediate_size,
- dropout=self.hidden_dropout_prob,
- attention_dropout=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- eos_token_id=self.eos_token_id,
- bos_token_id=self.bos_token_id,
- pad_token_id=self.pad_token_id,
- decoder_start_token_id=self.pad_token_id,
- **self.config_updates,
- )
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
- return config, inputs_dict
-
- def check_use_cache_forward(self, model_class_name, config, inputs_dict):
- max_decoder_length = 20
- model = model_class_name(config)
-
- encoder_outputs = model.encode(inputs_dict["input_ids"])
-
- decoder_input_ids, decoder_attention_mask = (
- inputs_dict["decoder_input_ids"],
- inputs_dict["decoder_attention_mask"],
- )
-
- past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
- decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
- (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
- )
- outputs_cache = model.decode(
- decoder_input_ids[:, :-1],
- encoder_outputs,
- decoder_attention_mask=decoder_attention_mask,
- past_key_values=past_key_values,
- decoder_position_ids=decoder_position_ids,
- )
-
- decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
- outputs_cache_next = model.decode(
- decoder_input_ids[:, -1:],
- encoder_outputs,
- decoder_attention_mask=decoder_attention_mask,
- past_key_values=outputs_cache.past_key_values,
- decoder_position_ids=decoder_position_ids,
- )
-
- outputs = model.decode(decoder_input_ids, encoder_outputs)
-
- diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
- self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
- def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
- max_decoder_length = 20
- model = model_class_name(config)
-
- encoder_outputs = model.encode(inputs_dict["input_ids"])
-
- decoder_input_ids, decoder_attention_mask = (
- inputs_dict["decoder_input_ids"],
- inputs_dict["decoder_attention_mask"],
- )
-
- decoder_attention_mask_cache = jnp.concatenate(
- [
- decoder_attention_mask,
- jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
- ],
- axis=-1,
- )
-
- past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
- decoder_position_ids = jnp.broadcast_to(
- jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
- (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
- )
-
- outputs_cache = model.decode(
- decoder_input_ids[:, :-1],
- encoder_outputs,
- decoder_attention_mask=decoder_attention_mask_cache,
- past_key_values=past_key_values,
- decoder_position_ids=decoder_position_ids,
- )
- decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
- outputs_cache_next = model.decode(
- decoder_input_ids[:, -1:],
- encoder_outputs,
- past_key_values=outputs_cache.past_key_values,
- decoder_attention_mask=decoder_attention_mask_cache,
- decoder_position_ids=decoder_position_ids,
- )
-
- outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
- diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
- self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
- config,
- input_ids,
- decoder_input_ids,
- attention_mask=None,
- decoder_attention_mask=None,
-):
- if attention_mask is None:
- attention_mask = np.not_equal(input_ids, config.pad_token_id).astype(np.int8)
- if decoder_attention_mask is None:
- decoder_attention_mask = np.concatenate([np.ones(decoder_input_ids[:, :1].shape, dtype=np.int8), np.not_equal(decoder_input_ids[:, 1:], config.pad_token_id).astype(np.int8)], axis=-1)
- return {
- "input_ids": input_ids,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "decoder_attention_mask": decoder_attention_mask,
- }
-
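The helper above derives attention masks from the pad token, with one subtlety: position 0 of the decoder input is always unmasked, because the decoder start token is often the pad token itself. A quick check of that behaviour (`pad_token_id` assumed to be 1, matching the tester default):

```python
import numpy as np

pad_token_id = 1
decoder_input_ids = np.array([[1, 5, 6, 1]])  # starts with decoder_start == pad

decoder_attention_mask = np.concatenate(
    [
        np.ones(decoder_input_ids[:, :1].shape, dtype=np.int8),  # position 0 kept
        np.not_equal(decoder_input_ids[:, 1:], pad_token_id).astype(np.int8),
    ],
    axis=-1,
)
assert decoder_attention_mask.tolist() == [[1, 1, 1, 0]]
```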
-
-@require_flax
-class Flax{{cookiecutter.camelcase_modelname}}ModelTest(FlaxModelTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- ) if is_flax_available()
- else ()
- )
- all_generative_model_classes = (Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_flax_available() else ()
- is_encoder_decoder = True
- test_pruning = False
- test_head_masking = False
- test_onnx = False
-
- def setUp(self):
- self.model_tester = Flax{{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_use_cache_forward(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- for model_class in self.all_model_classes:
- self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
- def test_use_cache_forward_with_attn_mask(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- for model_class in self.all_model_classes:
- self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-
-def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
- """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
- if a is None and b is None:
- return True
- try:
- if np.allclose(a, b, atol=atol):
- return True
- raise
- except Exception:
- if len(prefix) > 0:
- prefix = f"{prefix}: "
- raise AssertionError(f"{prefix}{a} != {b}")
-
-
-def _long_tensor(tok_lst):
- return np.array(tok_lst, dtype=np.int32)
-
-
-TOLERANCE = 1e-4
-
-
-@slow
-@require_sentencepiece
-@require_tokenizers
-@require_flax
-class Flax{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
- def test_inference_no_head(self):
- model = Flax{{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- # change to intended input here
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- output = model(**inputs_dict)[0]
- expected_shape = (1, 11, 1024)
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = np.array(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
- )
- _assert_tensors_equal(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
- def test_inference_with_head(self):
- model = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- # change to intended input here
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- output = model(**inputs_dict)[0]
- expected_shape = (1, 11, 1024)
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = np.array(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
- )
- _assert_tensors_equal(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
- def test_seq_to_seq_generation(self):
- hf = Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- batch_input = [
- # string 1,
- # string 2,
- # string 3,
- # string 4,
- ]
-
- # The inputs below test that we don't add any hypotheses outside of the top n_beams
- dct = tok.batch_encode_plus(
- batch_input,
- max_length=512,
- padding="max_length",
- truncation_strategy="only_first",
- truncation=True,
- return_tensors="np",
- )
-
- hypotheses_batch = hf.generate(
- input_ids=dct["input_ids"],
- attention_mask=dct["attention_mask"],
- num_beams=2,
- )
-
- EXPECTED = [
- # here expected 1,
- # here expected 2,
- # here expected 3,
- # here expected 4,
- ]
-
- generated = tok.batch_decode(
- hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
- )
- assert generated == EXPECTED
-{%- endif %}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index a92a900947cc85..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,971 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-import unittest
-
-from transformers import is_tf_available, {{cookiecutter.camelcase_modelname}}Config
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_tf_available():
- import tensorflow as tf
-
- from transformers import (
- TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
- TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- TF{{cookiecutter.camelcase_modelname}}Model,
- )
-
-
-class TF{{cookiecutter.camelcase_modelname}}ModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = {{cookiecutter.camelcase_modelname}}Config(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- initializer_range=self.initializer_range,
- return_dict=True,
- )
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def prepare_config_and_inputs_for_decoder(self):
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = self.prepare_config_and_inputs()
-
- config.is_decoder = True
- encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
- encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
- inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
- inputs = [input_ids, input_mask]
- result = model(inputs)
-
- result = model(input_ids)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_causal_lm_base_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.is_decoder = True
-
- model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
- inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
- result = model(inputs)
-
- inputs = [input_ids, input_mask]
- result = model(inputs)
-
- result = model(input_ids)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
-
- model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- "encoder_hidden_states": encoder_hidden_states,
- "encoder_attention_mask": encoder_attention_mask,
- }
- result = model(inputs)
-
- inputs = [input_ids, input_mask]
- result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
- # Also check the case where encoder outputs are not passed
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_causal_lm_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.is_decoder = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- prediction_scores = model(inputs)["logits"]
- self.parent.assertListEqual(
- list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
- )
-
- def create_and_check_causal_lm_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- "encoder_hidden_states": encoder_hidden_states,
- "encoder_attention_mask": encoder_attention_mask,
- }
- result = model(inputs)
-
- inputs = [input_ids, input_mask]
- result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
- prediction_scores = result["logits"]
- self.parent.assertListEqual(
- list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
- )
-
- def create_and_check_causal_lm_model_past(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- config.is_decoder = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
- # first forward pass
- outputs = model(input_ids, use_cache=True)
- outputs_use_cache_conf = model(input_ids)
- outputs_no_past = model(input_ids, use_cache=False)
-
- self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
- self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
- past_key_values = outputs.past_key_values
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
- # append to next input_ids and attn_mask
- next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
- output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
- output_from_past = model(
- next_tokens, past_key_values=past_key_values, output_hidden_states=True
- ).hidden_states[0]
-
- # select random slice
- random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
- output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
- output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
- # test that outputs are equal for slice
- tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
- def create_and_check_causal_lm_model_past_with_attn_mask(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- config.is_decoder = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
- # create attention mask
- half_seq_length = self.seq_length // 2
- attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
- attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
- attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
- # first forward pass
- outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
- past_key_values = outputs.past_key_values
-
- # change a random masked slice from input_ids
- random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
- random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
- vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
- condition = tf.transpose(
- tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
- )
- input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
- # append to next input_ids and attn_mask
- next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
- attn_mask = tf.concat(
- [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
- axis=1,
- )
-
- output_from_no_past = model(
- next_input_ids,
- attention_mask=attn_mask,
- output_hidden_states=True,
- ).hidden_states[0]
- output_from_past = model(
- next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
- ).hidden_states[0]
-
- # select random slice
- random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
- output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
- output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
- # test that outputs are equal for slice
- tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
- def create_and_check_causal_lm_model_past_large_inputs(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- config.is_decoder = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
- input_ids = input_ids[:1, :]
- input_mask = input_mask[:1, :]
- self.batch_size = 1
-
- # first forward pass
- outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
- past_key_values = outputs.past_key_values
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
- # append to next input_ids and attn_mask
- next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
- next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
- output_from_no_past = model(
- next_input_ids,
- attention_mask=next_attention_mask,
- output_hidden_states=True,
- ).hidden_states[0]
- output_from_past = model(
- next_tokens,
- attention_mask=next_attention_mask,
- past_key_values=past_key_values,
- output_hidden_states=True,
- ).hidden_states[0]
-
- self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
- # select random slice
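- # the cached pass only produced hidden states for the new tokens, so compare them
- # against the last three positions of the full-sequence output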
- random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
- output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
- # test that outputs are equal for slice
- tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
- def create_and_check_decoder_model_past_large_inputs(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
-
- model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
- input_ids = input_ids[:1, :]
- input_mask = input_mask[:1, :]
- encoder_hidden_states = encoder_hidden_states[:1, :, :]
- encoder_attention_mask = encoder_attention_mask[:1, :]
- self.batch_size = 1
-
- # first forward pass
- outputs = model(
- input_ids,
- attention_mask=input_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- use_cache=True,
- )
- past_key_values = outputs.past_key_values
-
- # create hypothetical next tokens and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
- # append to next input_ids and next attention mask
- next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
- next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
- output_from_no_past = model(
- next_input_ids,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- output_hidden_states=True,
- ).hidden_states[0]
- output_from_past = model(
- next_tokens,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- output_hidden_states=True,
- ).hidden_states[0]
-
- self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
- # select random slice
- random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
- output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
- # test that outputs are equal for slice
- tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- result = model(inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
-
- result = model(inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
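- # tile every input num_choices times so each candidate choice is scored against the same context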
- multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
- multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
- multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
- inputs = {
- "input_ids": multiple_choice_inputs_ids,
- "attention_mask": multiple_choice_input_mask,
- "token_type_ids": multiple_choice_token_type_ids,
- }
- result = model(inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
- result = model(inputs)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
- inputs = {
- "input_ids": input_ids,
- "attention_mask": input_mask,
- "token_type_ids": token_type_ids,
- }
-
- result = model(inputs)
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
-
- all_model_classes = (
- (
- TF{{cookiecutter.camelcase_modelname}}Model,
- TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
- TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- )
- if is_tf_available()
- else ()
- )
-
- test_head_masking = False
- test_onnx = False
-
- def setUp(self):
- self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- """Test the base model"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- @unittest.skip(reason="Template classes interact badly with this test.")
- def test_keras_fit(self):
- pass
-
- def test_causal_lm_base_model(self):
- """Test the base model of the causal LM model
-
- is_decoder=True, no cross_attention, no encoder outputs
- """
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
- def test_model_as_decoder(self):
- """Test the base model as a decoder (of an encoder-decoder architecture)
-
- is_decoder=True + cross_attention + pass encoder outputs
- """
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_causal_lm(self):
- """Test the causal LM model"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
- def test_causal_lm_model_as_decoder(self):
- """Test the causal LM model as a decoder"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
- def test_causal_lm_model_past(self):
- """Test causal LM model with `past_key_values`"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
- def test_causal_lm_model_past_with_attn_mask(self):
- """Test the causal LM model with `past_key_values` and `attention_mask`"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
- def test_causal_lm_model_past_with_large_inputs(self):
- """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
- def test_decoder_model_past_with_large_inputs(self):
- """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- self.assertIsNotNone(model)
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_masked_lm(self):
- model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
- output = model(input_ids)[0]
-
- # TODO Replace vocab size
- vocab_size = 32000
-
- expected_shape = [1, 6, vocab_size]
- self.assertEqual(output.shape, expected_shape)
-
- print(output[:, :3, :3])
-
- # TODO Replace values below with what was printed above.
- expected_slice = tf.constant(
- [
- [
- [-0.05243197, -0.04498899, 0.05512108],
- [-0.07444685, -0.01064632, 0.04352357],
- [-0.05020351, 0.05530146, 0.00700043],
- ]
- ]
- )
- tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
-
-{% else %}
-import unittest
-
-from transformers import (
- is_tf_available,
- {{cookiecutter.camelcase_modelname}}Config,
- {{cookiecutter.camelcase_modelname}}Tokenizer,
-)
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-
-
-if is_tf_available():
- import tensorflow as tf
-
- from transformers import (
- TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- TF{{cookiecutter.camelcase_modelname}}Model,
- )
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTester:
- config_cls = {{cookiecutter.camelcase_modelname}}Config
- config_updates = {}
- hidden_act = "gelu"
-
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_labels=False,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=20,
- eos_token_id=2,
- pad_token_id=1,
- bos_token_id=0,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
-
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.eos_token_id = eos_token_id
- self.pad_token_id = pad_token_id
- self.bos_token_id = bos_token_id
-
- def prepare_config_and_inputs_for_common(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
- eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
- input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
- decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- config = self.config_cls(
- vocab_size=self.vocab_size,
- d_model=self.hidden_size,
- encoder_layers=self.num_hidden_layers,
- decoder_layers=self.num_hidden_layers,
- encoder_attention_heads=self.num_attention_heads,
- decoder_attention_heads=self.num_attention_heads,
- encoder_ffn_dim=self.intermediate_size,
- decoder_ffn_dim=self.intermediate_size,
- dropout=self.hidden_dropout_prob,
- attention_dropout=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- eos_token_id=self.eos_token_id,
- bos_token_id=self.bos_token_id,
- pad_token_id=self.pad_token_id,
- decoder_start_token_id=self.pad_token_id,
- **self.config_updates,
- )
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
- return config, inputs_dict
-
- def check_decoder_model_past_large_inputs(self, config, inputs_dict):
- model = TF{{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder()
- input_ids = inputs_dict["input_ids"]
-
- input_ids = input_ids[:1, :]
- attention_mask = inputs_dict["attention_mask"][:1, :]
- self.batch_size = 1
-
- # first forward pass
- outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
- output, past_key_values = outputs.to_tuple()
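- # with the default flags used here, to_tuple() yields (last_hidden_state, past_key_values)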
-
- # create hypothetical next tokens and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
- # append to next input_ids and next attention mask
- next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
- next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
- output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
- output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
- self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
- # select random slice
- random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
- output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
- # test that outputs are equal for slice
- tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
- config,
- input_ids,
- decoder_input_ids,
- attention_mask=None,
- decoder_attention_mask=None,
-):
- if attention_mask is None:
- attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int32)
- if decoder_attention_mask is None:
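- # always attend to the first decoder token (the decoder start token) and mask padding in the rest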
- decoder_attention_mask = tf.concat(
- [
- tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int32),
- tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int32),
- ],
- axis=-1,
- )
- return {
- "input_ids": input_ids,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "decoder_attention_mask": decoder_attention_mask,
- }
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
- all_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model) if is_tf_available() else ()
- all_generative_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_tf_available() else ()
- is_encoder_decoder = True
- test_pruning = False
- test_head_masking = False
- test_onnx = False
-
- def setUp(self):
- self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_decoder_model_past_large_inputs(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
- self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
- @unittest.skip(reason="Template classes interact badly with this test.")
- def test_keras_fit(self):
- pass
-
-
-def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
- """If tensors are not close, or a and b aren't both tensors, raise a nice AssertionError."""
- if a is None and b is None:
- return True
- try:
- tf.debugging.assert_near(a, b, atol=atol)
- return True
- except Exception:
- if len(prefix) > 0:
- prefix = f"{prefix}: "
- raise AssertionError(f"{prefix}{a} != {b}")
-
-
-def _long_tensor(tok_lst):
- return tf.constant(tok_lst, dtype=tf.int32)
-
-
-TOLERANCE = 1e-4
-
-
-@slow
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
- def test_inference_no_head(self):
- model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- # change to intended input here
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- output = model(**inputs_dict)[0]
- expected_shape = (1, 11, 1024)
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = tf.constant(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
- )
- tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
- def test_inference_with_head(self):
- model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- # change to intended input here
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- output = model(**inputs_dict)[0]
- expected_shape = (1, 11, 1024)
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = tf.constant(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
- )
- tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
- def test_seq_to_seq_generation(self):
- hf = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
- tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- batch_input = [
- # string 1,
- # string 2,
- # string 3,
- # string 4,
- ]
-
- # The article below tests that we don't add any hypotheses outside of the top n_beams
- dct = tok.batch_encode_plus(
- batch_input,
- max_length=512,
- padding="max_length",
- truncation_strategy="only_first",
- truncation=True,
- return_tensors="tf",
- )
-
- hypotheses_batch = hf.generate(
- input_ids=dct["input_ids"],
- attention_mask=dct["attention_mask"],
- num_beams=2,
- )
-
- EXPECTED = [
- # here expected 1,
- # here expected 2,
- # here expected 3,
- # here expected 4,
- ]
-
- generated = tok.batch_decode(
- hypotheses_batch.numpy().tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
- )
- assert generated == EXPECTED
-{%- endif %}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index cdb5070e3d9955..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,1069 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the PyTorch {{cookiecutter.modelname}} model. """
-
-
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-import unittest
-
-from ...test_modeling_common import floats_tensor
-from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from transformers import {{cookiecutter.camelcase_modelname}}Config
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForMaskedLM,
- {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}ForTokenClassification,
- {{cookiecutter.camelcase_modelname}}Model,
- )
- from transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.uppercase_modelname}},
- )
-
-
-class {{cookiecutter.camelcase_modelname}}ModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def get_config(self):
- return {{cookiecutter.camelcase_modelname}}Config(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- is_decoder=False,
- initializer_range=self.initializer_range,
- )
-
- def prepare_config_and_inputs_for_decoder(self):
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = self.prepare_config_and_inputs()
-
- config.is_decoder = True
- encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
- encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = {{cookiecutter.camelcase_modelname}}Model(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- result = model(input_ids, token_type_ids=token_type_ids)
- result = model(input_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
- model = {{cookiecutter.camelcase_modelname}}Model(config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- )
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- )
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_causal_lm(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- model = {{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = {{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_decoder_model_past_large_inputs(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.is_decoder = True
- config.add_cross_attention = True
- model = {{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
- model.to(torch_device)
- model.eval()
-
- # first forward pass
- outputs = model(
- input_ids,
- attention_mask=input_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- use_cache=True,
- )
- past_key_values = outputs.past_key_values
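- # the cache holds the per-layer self-attention and cross-attention key/value states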
-
- # create hypothetical multiple next tokens and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
- # append to next input_ids and next attention mask
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
- output_from_no_past = model(
- next_input_ids,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- output_hidden_states=True,
- )["hidden_states"][0]
- output_from_past = model(
- next_tokens,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- output_hidden_states=True,
- )["hidden_states"][0]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
- self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
- # test that outputs are equal for slice
- self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = {{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = {{cookiecutter.camelcase_modelname}}ForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = {{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = {{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
- model.to(torch_device)
- model.eval()
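- # expand creates a broadcast view across the choice dimension; contiguous() materializes it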
- multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- result = model(
- multiple_choice_inputs_ids,
- attention_mask=multiple_choice_input_mask,
- token_type_ids=multiple_choice_token_type_ids,
- labels=choice_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, unittest.TestCase):
-
- all_model_classes = (
- (
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}ForMaskedLM,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}ForTokenClassification,
- )
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForCausalLM,) if is_torch_available() else ()
-
- def setUp(self):
- self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_various_embeddings(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- for type in ["absolute", "relative_key", "relative_key_query"]:
- config_and_inputs[0].position_embedding_type = type
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_decoder_model_past_with_large_inputs(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- def test_model_as_decoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
- def test_model_as_decoder_with_default_input_mask(self):
- # This regression test was failing with PyTorch < 1.3
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
- input_mask = None
-
- self.model_tester.create_and_check_model_as_decoder(
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "{{coockiecutter.checkpoint_identifier}}"
- model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_masked_lm(self):
- model = {{cookiecutter.camelcase_modelname}}ForMaskedLM.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- output = model(input_ids)[0]
-
- # TODO Replace vocab size
- vocab_size = 32000
-
- expected_shape = torch.Size((1, 6, vocab_size))
- self.assertEqual(output.shape, expected_shape)
-
- # TODO Replace values below with what was printed above.
- expected_slice = torch.tensor(
- [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]]
- )
-
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
-
-
-{% else -%}
-import copy
-import tempfile
-import unittest
-
-from transformers import is_torch_available
-from transformers.utils import cached_property
-from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- {{cookiecutter.camelcase_modelname}}Config,
- {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}Tokenizer,
- )
- from transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.camelcase_modelname}}Decoder,
- {{cookiecutter.camelcase_modelname}}Encoder,
- )
-
-
-def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
- config,
- input_ids,
- decoder_input_ids,
- attention_mask=None,
- decoder_attention_mask=None,
-):
- if attention_mask is None:
- attention_mask = input_ids.ne(config.pad_token_id)
- if decoder_attention_mask is None:
- decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
- return {
- "input_ids": input_ids,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "decoder_attention_mask": attention_mask,
- }
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_labels=False,
- vocab_size=99,
- hidden_size=16,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=4,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=20,
- eos_token_id=2,
- pad_token_id=1,
- bos_token_id=0,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.eos_token_id = eos_token_id
- self.pad_token_id = pad_token_id
- self.bos_token_id = bos_token_id
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
- input_ids[:, -1] = self.eos_token_id # Eos Token
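- # clamping at a minimum of 3 keeps the sampled ids clear of the pad (1), bos (0) and eos (2) tokens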
-
- decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- config = {{cookiecutter.camelcase_modelname}}Config(
- vocab_size=self.vocab_size,
- d_model=self.hidden_size,
- encoder_layers=self.num_hidden_layers,
- decoder_layers=self.num_hidden_layers,
- encoder_attention_heads=self.num_attention_heads,
- decoder_attention_heads=self.num_attention_heads,
- encoder_ffn_dim=self.intermediate_size,
- decoder_ffn_dim=self.intermediate_size,
- dropout=self.hidden_dropout_prob,
- attention_dropout=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- eos_token_id=self.eos_token_id,
- bos_token_id=self.bos_token_id,
- pad_token_id=self.pad_token_id,
- )
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
- return config, inputs_dict
-
- def prepare_config_and_inputs_for_common(self):
- config, inputs_dict = self.prepare_config_and_inputs()
- return config, inputs_dict
-
- def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
- model = {{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder().to(torch_device).eval()
- input_ids = inputs_dict["input_ids"]
- attention_mask = inputs_dict["attention_mask"]
-
- # first forward pass
- outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
- output, past_key_values = outputs.to_tuple()
-
- # create hypothetical multiple next tokens and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
- # append to next input_ids and next attention mask
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
-
- output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
- output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
- self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
- # test that outputs are equal for slice
- self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
-
- def check_encoder_decoder_model_standalone(self, config, inputs_dict):
- model = {{cookiecutter.camelcase_modelname}}Model(config=config).to(torch_device).eval()
- outputs = model(**inputs_dict)
-
- encoder_last_hidden_state = outputs.encoder_last_hidden_state
- last_hidden_state = outputs.last_hidden_state
-
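- # round-trip the encoder through save_pretrained/from_pretrained and check it matches the joint model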
- with tempfile.TemporaryDirectory() as tmpdirname:
- encoder = model.get_encoder()
- encoder.save_pretrained(tmpdirname)
- encoder = {{cookiecutter.camelcase_modelname}}Encoder.from_pretrained(tmpdirname).to(torch_device)
-
- encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
- 0
- ]
-
- self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- decoder = model.get_decoder()
- decoder.save_pretrained(tmpdirname)
- decoder = {{cookiecutter.camelcase_modelname}}Decoder.from_pretrained(tmpdirname).to(torch_device)
-
- last_hidden_state_2 = decoder(
- input_ids=inputs_dict["decoder_input_ids"],
- attention_mask=inputs_dict["decoder_attention_mask"],
- encoder_hidden_states=encoder_last_hidden_state,
- encoder_attention_mask=inputs_dict["attention_mask"],
- )[0]
-
- self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
- all_model_classes = (
- ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForSequenceClassification, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering)
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_torch_available() else ()
- is_encoder_decoder = True
- test_pruning = False
- test_head_masking = False
- test_missing_keys = False
-
- def setUp(self):
- self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_save_load_strict(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs()
- for model_class in self.all_model_classes:
- model = model_class(config)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_pretrained(tmpdirname)
- model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
- self.assertEqual(info["missing_keys"], [])
-
- def test_decoder_model_past_with_large_inputs(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
- def test_encoder_decoder_model_standalone(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
- self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
-
- # {{cookiecutter.camelcase_modelname}}ForSequenceClassification does not support inputs_embeds
- def test_inputs_embeds(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering):
- model = model_class(config)
- model.to(torch_device)
- model.eval()
-
- inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-
- if not self.is_encoder_decoder:
- input_ids = inputs["input_ids"]
- del inputs["input_ids"]
- else:
- encoder_input_ids = inputs["input_ids"]
- decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
- del inputs["input_ids"]
- inputs.pop("decoder_input_ids", None)
-
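- # swap token ids for their embeddings to exercise the inputs_embeds code path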
- wte = model.get_input_embeddings()
- if not self.is_encoder_decoder:
- inputs["inputs_embeds"] = wte(input_ids)
- else:
- inputs["inputs_embeds"] = wte(encoder_input_ids)
- inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
-
- with torch.no_grad():
- model(**inputs)[0]
-
- def test_generate_fp16(self):
- config, input_dict = self.model_tester.prepare_config_and_inputs()
- input_ids = input_dict["input_ids"]
- attention_mask = input_ids.ne(1).to(torch_device)
- model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration(config).eval().to(torch_device)
- if torch_device == "cuda":
- model.half()
- model.generate(input_ids, attention_mask=attention_mask)
- model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-
-def assert_tensors_close(a, b, atol=1e-12, prefix=""):
- """If tensors have different shapes, different values, or a and b are not both tensors, raise a nice AssertionError."""
- if a is None and b is None:
- return True
- try:
- if torch.allclose(a, b, atol=atol):
- return True
- raise AssertionError()
- except Exception:
- pct_different = torch.gt((a - b).abs(), atol).float().mean().item()
- if a.numel() > 100:
- msg = f"tensor values are {pct_different:.1%} different."
- else:
- msg = f"{a} != {b}"
- if prefix:
- msg = prefix + ": " + msg
- raise AssertionError(msg)
-
-
-def _long_tensor(tok_lst):
- return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)
-
-
-TOLERANCE = 1e-4
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-@slow
-class {{cookiecutter.camelcase_modelname}}ModelIntegrationTests(unittest.TestCase):
- @cached_property
- def default_tokenizer(self):
- return {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- def test_inference_no_head(self):
- model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- with torch.no_grad():
- output = model(**inputs_dict)[0]
- expected_shape = torch.Size((1, 11, 1024))
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = torch.tensor(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
- )
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
-
- def test_inference_head(self):
- model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
-
- # change to intended input
- input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
- with torch.no_grad():
- output = model(**inputs_dict)[0]
- expected_shape = torch.Size((1, 11, model.config.vocab_size))
- self.assertEqual(output.shape, expected_shape)
- # change to expected output here
- expected_slice = torch.tensor(
- [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
- )
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
-
- def test_seq_to_seq_generation(self):
- hf = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
- tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
- batch_input = [
- # string 1,
- # string 2,
- # string 3,
- # string 4,
- ]
-
- # The article below tests that we don't add any hypotheses outside of the top n_beams
- dct = tok.batch_encode_plus(
- batch_input,
- max_length=512,
- padding="max_length",
- truncation_strategy="only_first",
- truncation=True,
- return_tensors="pt",
- )
-
- hypotheses_batch = hf.generate(
- input_ids=dct["input_ids"].to(torch_device),
- attention_mask=dct["attention_mask"].to(torch_device),
- num_beams=2,
- )
-
- EXPECTED = [
- # here expected 1,
- # here expected 2,
- # here expected 3,
- # here expected 4,
- ]
-
- generated = tok.batch_decode(
- hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
- )
- assert generated == EXPECTED
-
-
-class {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTester:
- def __init__(
- self,
- parent,
- vocab_size=99,
- batch_size=13,
- d_model=16,
- decoder_seq_length=7,
- is_training=True,
- is_decoder=True,
- use_attention_mask=True,
- use_cache=False,
- use_labels=True,
- decoder_start_token_id=2,
- decoder_ffn_dim=32,
- decoder_layers=4,
- encoder_attention_heads=4,
- decoder_attention_heads=4,
- max_position_embeddings=30,
- is_encoder_decoder=False,
- pad_token_id=0,
- bos_token_id=1,
- eos_token_id=2,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.decoder_seq_length = decoder_seq_length
- # For common tests
- self.seq_length = self.decoder_seq_length
- self.is_training = is_training
- self.use_attention_mask = use_attention_mask
- self.use_labels = use_labels
-
- self.vocab_size = vocab_size
- self.d_model = d_model
- self.hidden_size = d_model
- self.num_hidden_layers = decoder_layers
- self.decoder_layers = decoder_layers
- self.decoder_ffn_dim = decoder_ffn_dim
- self.encoder_attention_heads = encoder_attention_heads
- self.decoder_attention_heads = decoder_attention_heads
- self.num_attention_heads = decoder_attention_heads
- self.eos_token_id = eos_token_id
- self.bos_token_id = bos_token_id
- self.pad_token_id = pad_token_id
- self.decoder_start_token_id = decoder_start_token_id
- self.use_cache = use_cache
- self.max_position_embeddings = max_position_embeddings
- self.is_encoder_decoder = is_encoder_decoder
-
- self.scope = None
- self.decoder_key_length = decoder_seq_length
- self.base_model_out_len = 2
- self.decoder_attention_idx = 1
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
- attention_mask = None
- if self.use_attention_mask:
- attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
- lm_labels = None
- if self.use_labels:
- lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
- config = {{cookiecutter.camelcase_modelname}}Config(
- vocab_size=self.vocab_size,
- d_model=self.d_model,
- decoder_layers=self.decoder_layers,
- decoder_ffn_dim=self.decoder_ffn_dim,
- encoder_attention_heads=self.encoder_attention_heads,
- decoder_attention_heads=self.decoder_attention_heads,
- eos_token_id=self.eos_token_id,
- bos_token_id=self.bos_token_id,
- use_cache=self.use_cache,
- pad_token_id=self.pad_token_id,
- decoder_start_token_id=self.decoder_start_token_id,
- max_position_embeddings=self.max_position_embeddings,
- is_encoder_decoder=self.is_encoder_decoder,
- )
-
- return (
- config,
- input_ids,
- attention_mask,
- lm_labels,
- )
-
- def create_and_check_decoder_model_past(
- self,
- config,
- input_ids,
- attention_mask,
- lm_labels,
- ):
- config.use_cache = True
- model = {{cookiecutter.camelcase_modelname}}Decoder(config=config).to(torch_device).eval()
- # first forward pass
- outputs = model(input_ids, use_cache=True)
- outputs_use_cache_conf = model(input_ids)
- outputs_no_past = model(input_ids, use_cache=False)
-
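- # with use_cache=True in the config, the default call should also return a cache,
- # while use_cache=False should drop it (hence one fewer output)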
- self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
- self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
- past_key_values = outputs["past_key_values"]
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
- # append to next input_ids
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
- output_from_no_past = model(next_input_ids)["last_hidden_state"]
- output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
- # test that outputs are equal for slice
- assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
-
- def create_and_check_decoder_model_attention_mask_past(
- self,
- config,
- input_ids,
- attention_mask,
- lm_labels,
- ):
- model = {{cookiecutter.camelcase_modelname}}Decoder(config=config).to(torch_device).eval()
-
- # create attention mask
- attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-
- half_seq_length = input_ids.shape[-1] // 2
- attn_mask[:, half_seq_length:] = 0
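- # tokens in the masked-out second half can be edited later without changing the visible outputs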
-
- # first forward pass
- past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
- # change a random masked slice from input_ids
- random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
- random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
- input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
-
- # append to next input_ids and attn_mask
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- attn_mask = torch.cat(
- [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
- dim=1,
- )
-
- # get two different outputs
- output_from_no_past = model(next_input_ids)["last_hidden_state"]
- output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
- # test that outputs are equal for slice
- assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- attention_mask,
- lm_labels,
- ) = config_and_inputs
-
- inputs_dict = {
- "input_ids": input_ids,
- "attention_mask": attention_mask,
- }
- return config, inputs_dict
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
- all_model_classes = ({{cookiecutter.camelcase_modelname}}Decoder, {{cookiecutter.camelcase_modelname}}ForCausalLM) if is_torch_available() else ()
- all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForCausalLM,) if is_torch_available() else ()
- test_pruning = False
- is_encoder_decoder = False
-
- def setUp(
- self,
- ):
- self.model_tester = {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTester(self, is_training=False)
- self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_decoder_model_past(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
-
- def test_decoder_model_attn_mask_past(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
-
- def test_retain_grad_hidden_states_attentions(self):
- # decoder cannot keep gradients
- return
-{% endif -%}
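The deleted tester above exercises the core KV-cache invariant: a forward pass over the full sequence and an incremental pass that reuses `past_key_values` must agree at the new positions. A minimal restatement of that check, assuming any decoder that returns `last_hidden_state` and `past_key_values` (the function and its arguments are illustrative, not part of the deleted template):

```python
import torch

def check_cache_equivalence(decoder, input_ids, next_tokens, atol=1e-3):
    # full pass over the prefix plus the new tokens
    full = decoder(torch.cat([input_ids, next_tokens], dim=-1))["last_hidden_state"]
    # incremental pass: cache the prefix, then feed only the new tokens
    past = decoder(input_ids, use_cache=True)["past_key_values"]
    step = decoder(next_tokens, past_key_values=past)["last_hidden_state"]
    # outputs at the new positions must match up to numerical noise
    return torch.allclose(full[:, -next_tokens.shape[-1]:], step, atol=atol)
```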
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index f5ed661ade3625..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,461 +0,0 @@
-## Copyright 2022 The HuggingFace Team. All rights reserved.
-##
-## Licensed under the Apache License, Version 2.0 (the "License");
-## you may not use this file except in compliance with the License.
-## You may obtain a copy of the License at
-##
-## http://www.apache.org/licenses/LICENSE-2.0
-##
-## Unless required by applicable law or agreed to in writing, software
-## distributed under the License is distributed on an "AS IS" BASIS,
-## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-## See the License for the specific language governing permissions and
-## limitations under the License.
-
-## This file is made so that specific statements may be copied inside existing files. This is useful to copy
-## import statements in __init__.py, or to complete model lists in the AUTO files.
-##
-## It is to be used as such:
-## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH
-## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurrence** of that line in the file at FILE_PATH
-## Put '# Replace with:' followed by the lines containing the content to define the content
-## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting
-## content in that file.
-##
-## Put '## COMMENT' to comment on the file.
-
-# To replace in: "src/transformers/__init__.py"
-# Below: " # PyTorch models structure" if generating PyTorch
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "{{cookiecutter.camelcase_modelname}}Layer",
- "{{cookiecutter.camelcase_modelname}}Model",
- "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- "load_tf_weights_in_{{cookiecutter.lowercase_modelname}}",
- ]
- )
-{% else %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "{{cookiecutter.camelcase_modelname}}Model",
- "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
- )
-{% endif -%}
-# End.
-
-# Below: " # TensorFlow models structure" if generating TensorFlow
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "TF{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "TF{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "TF{{cookiecutter.camelcase_modelname}}Layer",
- "TF{{cookiecutter.camelcase_modelname}}Model",
- "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
- )
-{% else %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "TF{{cookiecutter.camelcase_modelname}}Model",
- "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
- )
-{% endif -%}
-# End.
-
-# Below: " # Flax models structure" if generating Flax
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM",
- "Flax{{cookiecutter.camelcase_modelname}}ForCausalLM",
- "Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
- "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification",
- "Flax{{cookiecutter.camelcase_modelname}}Layer",
- "Flax{{cookiecutter.camelcase_modelname}}Model",
- "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
- )
-{% else %}
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
- [
- "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
- "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
- "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
- "Flax{{cookiecutter.camelcase_modelname}}Model",
- "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
- ]
- )
-{% endif -%}
-# End.
-
-# Below: " # Fast tokenizers structure"
-# Replace with:
- _import_structure["models.{{cookiecutter.lowercase_modelname}}"].append("{{cookiecutter.camelcase_modelname}}TokenizerFast")
-# End.
-
-# Below: " # Models"
-# Replace with:
- "models.{{cookiecutter.lowercase_modelname}}": ["{{cookiecutter.camelcase_modelname}}Config", "{{cookiecutter.camelcase_modelname}}Tokenizer"],
-# End.
-
-# To replace in: "src/transformers/__init__.py"
-# Below: " # PyTorch model imports" if generating PyTorch
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.camelcase_modelname}}ForMaskedLM,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}ForTokenClassification,
- {{cookiecutter.camelcase_modelname}}Layer,
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}PreTrainedModel,
- load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
- )
-{% else %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- {{cookiecutter.camelcase_modelname}}ForCausalLM,
- {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- {{cookiecutter.camelcase_modelname}}Model,
- {{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif -%}
-# End.
-
-# Below: " # TensorFlow model imports" if generating TensorFlow
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
- TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- TF{{cookiecutter.camelcase_modelname}}Layer,
- TF{{cookiecutter.camelcase_modelname}}Model,
- TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% else %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- TF{{cookiecutter.camelcase_modelname}}Model,
- TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif -%}
-# End.
-
-# Below: " # Flax model imports" if generating Flax
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM,
- Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
- Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification,
- Flax{{cookiecutter.camelcase_modelname}}Layer,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% else %}
- from .models.{{cookiecutter.lowercase_modelname}} import (
- Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
- Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
- Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
- Flax{{cookiecutter.camelcase_modelname}}Model,
- Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
- )
-{% endif -%}
-# End.
-
-# Below: " # Fast tokenizers imports"
-# Replace with:
- from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast
-# End.
-
-# Below: " from .models.albert import AlbertConfig"
-# Replace with:
- from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Tokenizer
-# End.
-
-
-
-# To replace in: "src/transformers/models/__init__.py"
-# Below: "from . import ("
-# Replace with:
- {{cookiecutter.lowercase_modelname}},
-# End.
-
-
-# To replace in: "src/transformers/models/auto/configuration_auto.py"
-# Below: "# Add configs here"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}Config"),
-# End.
-
-# Below: "# Add full (and cased) model names here"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}"),
-# End.
-
-
-
-# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch
-# Below: "# Base model mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model with LM heads mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow
-# Below: "# Base model mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model with LM heads mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# To replace in: "src/transformers/models/auto/modeling_flax_auto.py" if generating Flax
-# Below: "# Base model mapping"
-# Replace with:
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
- ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-
-
-# To replace in: "utils/check_repo.py" if generating PyTorch
-
-# Below: "models to ignore for model xxx mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else -%}
- "{{cookiecutter.camelcase_modelname}}Encoder",
- "{{cookiecutter.camelcase_modelname}}Decoder",
- "{{cookiecutter.camelcase_modelname}}DecoderWrapper",
-{% endif -%}
-# End.
-
-# Below: "models to ignore for not tested"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else -%}
- "{{cookiecutter.camelcase_modelname}}Encoder", # Building part of bigger (tested) model.
- "{{cookiecutter.camelcase_modelname}}Decoder", # Building part of bigger (tested) model.
- "{{cookiecutter.camelcase_modelname}}DecoderWrapper", # Building part of bigger (tested) model.
-{% endif -%}
-# End.
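The `to_replace_...py` file deleted above was never imported as Python; it is a directive script that the `add-new-model` command parsed to splice statements into existing files. As a rough sketch of the semantics its header comments describe (an illustration of the mechanism, not the actual parser shipped with `transformers-cli`; Jinja conditionals would be rendered before this step):

```python
def apply_directives(directives_text: str, files: dict) -> None:
    """Apply '# To replace in:' / '# Below:' / '# Replace with:' / '# End.' blocks.

    `files` maps file paths to their contents and is mutated in place.
    """
    path, anchor, collecting, replacement = None, None, False, []
    for line in directives_text.splitlines():
        if line.startswith('# To replace in:'):
            path = line.split('"')[1]          # target file for following blocks
        elif line.startswith('# Below:'):
            anchor = line.split('"')[1]        # statement to paste below
        elif line.startswith('# Replace with:'):
            collecting, replacement = True, []
        elif line.startswith('# End.'):
            if path in files and anchor is not None:
                text = files[path]
                idx = text.find(anchor)        # first occurrence, per the header
                if idx != -1:
                    eol = text.find('\n', idx)
                    insert_at = len(text) if eol == -1 else eol + 1
                    files[path] = text[:insert_at] + '\n'.join(replacement) + '\n' + text[insert_at:]
            collecting = False
        elif collecting:
            replacement.append(line)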
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index 3712c970296ea1..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for {{cookiecutter.modelname}}."""
-
-{%- if cookiecutter.tokenizer_type == "Based on BERT" %}
-from ...utils import logging
-from ..bert.tokenization_bert_fast import BertTokenizerFast
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
- "vocab_file": {
- "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
- }
-}
-
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
- r"""
- Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
- [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BertTokenizerFast`] and runs
- end-to-end tokenization: punctuation splitting and wordpiece.
-
- Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
- parameters.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
- slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
-{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
-from ...utils import logging
-from ..bart.tokenization_bart_fast import BartTokenizerFast
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
- r"""
- Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
- [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BartTokenizerFast`] and runs
- end-to-end tokenization: punctuation splitting and wordpiece.
-
- Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
- parameters.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
-{%- elif cookiecutter.tokenizer_type == "Standalone" %}
-from typing import List, Optional
-
-from tokenizers import ByteLevelBPETokenizer
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
- """
- Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
- Args:
- vocab_file (`str`):
- Path to the vocabulary file.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
- def __init__(
- self,
- vocab_file,
- merges_file,
- unk_token="<|endoftext|>",
- bos_token="<|endoftext|>",
- eos_token="<|endoftext|>",
- add_prefix_space=False,
- trim_offsets=True,
- **kwargs
- ):
- super().__init__(
- ByteLevelBPETokenizer(
- vocab_file=vocab_file,
- merges_file=merges_file,
- add_prefix_space=add_prefix_space,
- trim_offsets=trim_offsets,
- ),
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- **kwargs,
- )
- self.add_prefix_space = add_prefix_space
-
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
- output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
- if token_ids_1 is None:
- return output
-
- return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-
- def create_token_type_ids_from_sequences(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
- ) -> List[int]:
- """
- Create a mask from the two sequences passed to be used in a sequence-pair classification task.
- {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
- Args:
- token_ids_0 (`List[int]`):
- List of IDs.
- token_ids_1 (`List[int]`, *optional*):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- `List[int]`: List of zeros.
- """
- sep = [self.sep_token_id]
- cls = [self.cls_token_id]
-
- if token_ids_1 is None:
- return len(cls + token_ids_0 + sep) * [0]
- return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-
-{% endif %}
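For reference, the Standalone branch above wires `build_inputs_with_special_tokens` to a GPT-2/RoBERTa-style layout. A self-contained restatement with made-up token ids, handy for sanity-checking the expected shapes (the ids are illustrative, not from any real vocabulary):

```python
BOS_ID, EOS_ID = 0, 2  # illustrative ids only

def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None):
    # single sequence: <bos> ids_0 <eos>
    output = [BOS_ID] + token_ids_0 + [EOS_ID]
    if token_ids_1 is None:
        return output
    # pair of sequences: <bos> ids_0 <eos> <eos> ids_1 <eos>
    return output + [EOS_ID] + token_ids_1 + [EOS_ID]

assert build_inputs_with_special_tokens([5, 6]) == [0, 5, 6, 2]
assert build_inputs_with_special_tokens([5, 6], [7]) == [0, 5, 6, 2, 2, 7, 2]
```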
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
deleted file mode 100644
index 2f627adeb7df20..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for {{cookiecutter.modelname}}."""
-
-{%- if cookiecutter.tokenizer_type == "Based on BERT" %}
-from ...utils import logging
-from ..bert.tokenization_bert import BertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
- "vocab_file": {
- "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
- }
-}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
- r"""
- Construct a {{cookiecutter.modelname}} tokenizer.
-
- [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
- tokenization: punctuation splitting and wordpiece.
-
- Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
- parameters.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-
-{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
-from ...utils import logging
-from ..bart.tokenization_bart import BartTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
- """
- Construct a {{cookiecutter.modelname}} tokenizer.
-
- [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
- tokenization: punctuation splitting and wordpiece.
-
- Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
- parameters.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
-
-{%- elif cookiecutter.tokenizer_type == "Standalone" %}
-from typing import List, Optional
-
-from tokenizers import ByteLevelBPETokenizer
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
- """
- Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding.
-
- Args:
- vocab_file (`str`):
- Path to the vocabulary file.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- model_input_names = ["input_ids", "attention_mask"]
-
- def __init__(
- self,
- vocab_file,
- unk_token="<|endoftext|>",
- bos_token="<|endoftext|>",
- eos_token="<|endoftext|>",
- **kwargs
- ):
- bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
- eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
-
- """ Initialisation """
-
- @property
- def vocab_size(self):
- """ Returns vocab size """
-
- def get_vocab(self):
- """ Returns vocab as a dict """
-
- def _tokenize(self, text):
- """ Returns a tokenized string. """
-
- def _convert_token_to_id(self, token):
- """ Converts a token (str) in an id using the vocab. """
-
- def _convert_id_to_token(self, index):
- """Converts an index (integer) in a token (str) using the vocab."""
-
- def convert_tokens_to_string(self, tokens):
- """ Converts a sequence of tokens (string) in a single string. """
-
- def save_vocabulary(self, save_directory):
- """
- Save the vocabulary and special tokens file to a directory.
-
- Args:
- save_directory (`str`):
- The directory in which to save the vocabulary.
-
- Returns:
- `Tuple(str)`: Paths to the files saved.
- """
-
- def build_inputs_with_special_tokens(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
- ) -> List[int]:
- """
- Build model inputs from a sequence or a pair of sequence for sequence classification tasks
- by concatenating and adding special tokens.
- A {{cookiecutter.modelname}} sequence has the following format:
-
- - single sequence: `<s> X </s>`
- - pair of sequences: `<s> A </s></s> B </s>`
-
- Args:
- token_ids_0 (`List[int]`):
- List of IDs to which the special tokens will be added.
- token_ids_1 (`List[int]`, *optional*):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
- """
- if token_ids_1 is None:
- return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
- cls = [self.cls_token_id]
- sep = [self.sep_token_id]
- return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
- def get_special_tokens_mask(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
- ) -> List[int]:
- """
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer `prepare_for_model` method.
-
- Args:
- token_ids_0 (`List[int]`):
- List of IDs.
- token_ids_1 (`List[int]`, *optional*):
- Optional second list of IDs for sequence pairs.
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the token list is already formatted with special tokens for the model.
-
- Returns:
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
- """
- if already_has_special_tokens:
- return super().get_special_tokens_mask(
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
- )
-
- if token_ids_1 is None:
- return [1] + ([0] * len(token_ids_0)) + [1]
- return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
- def create_token_type_ids_from_sequences(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
- ) -> List[int]:
- """
- Create a mask from the two sequences passed to be used in a sequence-pair classification task.
- {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
- Args:
- token_ids_0 (`List[int]`):
- List of IDs.
- token_ids_1 (`List[int]`, *optional*):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- `List[int]`: List of zeros.
- """
- sep = [self.sep_token_id]
- cls = [self.cls_token_id]
-
- if token_ids_1 is None:
- return len(cls + token_ids_0 + sep) * [0]
- return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
- def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
- add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
- if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
- text = " " + text
- return (text, kwargs)
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
- """
- Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
- Args:
- vocab_file (`str`):
- Path to the vocabulary file.
- """
-
- vocab_files_names = VOCAB_FILES_NAMES
- model_input_names = ["input_ids", "attention_mask"]
-
- def __init__(
- self,
- vocab_file,
- merges_file,
- unk_token="<|endoftext|>",
- bos_token="<|endoftext|>",
- eos_token="<|endoftext|>",
- add_prefix_space=False,
- trim_offsets=True,
- **kwargs
- ):
- super().__init__(
- ByteLevelBPETokenizer(
- vocab_file=vocab_file,
- merges_file=merges_file,
- add_prefix_space=add_prefix_space,
- trim_offsets=trim_offsets,
- ),
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- **kwargs,
- )
- self.add_prefix_space = add_prefix_space
-
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
- output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
- if token_ids_1 is None:
- return output
-
- return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-
- def create_token_type_ids_from_sequences(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
- ) -> List[int]:
- """
- Create a mask from the two sequences passed to be used in a sequence-pair classification task.
- {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
- Args:
- token_ids_0 (`List[int]`):
- List of IDs.
- token_ids_1 (`List[int]`, *optional*):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- `List[int]`: List of zeros.
- """
- sep = [self.sep_token_id]
- cls = [self.cls_token_id]
-
- if token_ids_1 is None:
- return len(cls + token_ids_0 + sep) * [0]
- return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-{% endif %}
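The slow-tokenizer template above encodes the same sequence-pair conventions; `get_special_tokens_mask` in particular is easy to misread. A worked example of the mask it produces, as a pure-Python restatement of the deleted method body (function name shortened for illustration):

```python
def special_tokens_mask(token_ids_0, token_ids_1=None):
    # 1 marks a position reserved for a special token, 0 a sequence token
    if token_ids_1 is None:
        # layout: <cls> ids_0 <sep>
        return [1] + [0] * len(token_ids_0) + [1]
    # pair layout: <cls> ids_0 <sep> <sep> ids_1 <sep>
    return [1] + [0] * len(token_ids_0) + [1, 1] + [0] * len(token_ids_1) + [1]

assert special_tokens_mask([10, 11, 12]) == [1, 0, 0, 0, 1]
assert special_tokens_mask([10], [11, 12]) == [1, 0, 1, 1, 0, 0, 1]
```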
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.md b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.md
deleted file mode 100644
index dcbac3638d496c..00000000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.md
+++ /dev/null
@@ -1,234 +0,0 @@
-
-
-# {{cookiecutter.modelname}}
-
-## Overview
-
-The {{cookiecutter.modelname}} model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
-
-The abstract from the paper is the following:
-
-*<INSERT PAPER ABSTRACT HERE>*
-
-Tips:
-
-<INSERT TIPS ABOUT MODEL HERE>
-
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>). The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
-
-## {{cookiecutter.camelcase_modelname}}Config
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Config
-
-
-## {{cookiecutter.camelcase_modelname}}Tokenizer
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Tokenizer
- - build_inputs_with_special_tokens
- - get_special_tokens_mask
- - create_token_type_ids_from_sequences
- - save_vocabulary
-
-
-## {{cookiecutter.camelcase_modelname}}TokenizerFast
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}TokenizerFast
-
-
-{% if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-## {{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Model
- - forward
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## {{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForMaskedLM
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
- - forward
-
-## {{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
- - forward
-
-{%- else %}
-## {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForSequenceClassification
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
- - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
- - forward
-
-
-{% endif -%}
-{% endif -%}
-{% if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-## TF{{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}Model
- - call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
- - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForCausalLM
- - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
- - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
- - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
- - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
- - call
-
-
-{%- else %}
-## TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
- - call
-
-
-{% endif -%}
-{% endif -%}
-
-{% if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-## Flax{{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}Model
- - call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
- - call
-
-
-{%- else %}
-## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
- - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
- - call
-
-
-{% endif -%}
-{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter.json b/templates/adding_a_new_model/cookiecutter.json
deleted file mode 100644
index 1fd9fda5b2f1be..00000000000000
--- a/templates/adding_a_new_model/cookiecutter.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
- "modelname": "BrandNewBERT",
- "uppercase_modelname": "BRAND_NEW_BERT",
- "lowercase_modelname": "brand_new_bert",
- "camelcase_modelname": "BrandNewBert",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "brand-new-bert-base-cased",
- "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"],
- "generate_tensorflow_pytorch_and_flax": [
- "PyTorch, TensorFlow and Flax",
- "PyTorch & TensorFlow",
- "PyTorch & Flax",
- "TensorFlow & Flax",
- "PyTorch",
- "TensorFlow",
- "Flax"
- ],
- "is_encoder_decoder_model": ["True", "False"]
-}
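`cookiecutter.json` above is a standard cookiecutter config: list-valued fields ("tokenizer_type", "generate_tensorflow_pytorch_and_flax", ...) render as multiple-choice prompts, string fields as free-form ones with the shown defaults. The deleted CI job drove it non-interactively through `transformers-cli add-new-model --testing_file=...`; the equivalent direct call via the cookiecutter API would look roughly like this (the path and overrides are illustrative, and omitted fields fall back to the defaults above):

```python
from cookiecutter.main import cookiecutter

cookiecutter(
    "templates/adding_a_new_model",  # directory containing cookiecutter.json
    no_input=True,                   # don't prompt; use defaults plus overrides
    extra_context={
        "modelname": "BrandNewBERT",
        "tokenizer_type": "Based on BERT",
        "generate_tensorflow_pytorch_and_flax": "PyTorch",
        "is_encoder_decoder_model": "False",
    },
)
```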
diff --git a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
deleted file mode 100644
index dcc686c71210c9..00000000000000
--- a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "Template",
- "uppercase_modelname": "TEMPLATE",
- "lowercase_modelname": "template",
- "camelcase_modelname": "Template",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "brand-new-bert-base-cased",
- "tokenizer_type": "Based on BERT",
- "generate_tensorflow_pytorch_and_flax": "PyTorch, TensorFlow and Flax",
- "is_encoder_decoder_model": "False"
-}
diff --git a/templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json
deleted file mode 100644
index 506ba974c730f5..00000000000000
--- a/templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "TemplateFLAX",
- "uppercase_modelname": "TEMPLATE_FLAX",
- "lowercase_modelname": "template_flax",
- "camelcase_modelname": "TemplateFlax",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "brand-new-bert-base-cased",
- "tokenizer_type": "Based on BERT",
- "generate_tensorflow_pytorch_and_flax": "Flax",
- "is_encoder_decoder_model": "False"
-}
diff --git a/templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json b/templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json
deleted file mode 100644
index a5ad69324e6fc8..00000000000000
--- a/templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "FlaxNewENCDEC",
- "uppercase_modelname": "FLAX_NEW_ENC_DEC",
- "lowercase_modelname": "flax_new_enc_dec_template",
- "camelcase_modelname": "FlaxNewEncDec",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "new-flax-enc-dec-base",
- "tokenizer_type": "Based on BART",
- "generate_tensorflow_pytorch_and_flax": "Flax",
- "is_encoder_decoder_model": "True"
-}
diff --git a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
deleted file mode 100644
index 48a47e5dc4a4a2..00000000000000
--- a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "TemplatePT",
- "uppercase_modelname": "TEMPLATE_PT",
- "lowercase_modelname": "template_pt",
- "camelcase_modelname": "TemplatePt",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "brand-new-bert-base-cased",
- "tokenizer_type": "Based on BERT",
- "generate_tensorflow_pytorch_and_flax": "PyTorch",
- "is_encoder_decoder_model": "False"
-}
diff --git a/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json b/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
deleted file mode 100644
index 2fb0fdf4e598f9..00000000000000
--- a/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "PTNewENCDEC",
- "uppercase_modelname": "PT_NEW_ENC_DEC",
- "lowercase_modelname": "pt_new_enc_dec_template",
- "camelcase_modelname": "PtNewEncDec",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "pt-new-enc-dec-base",
- "tokenizer_type": "Based on BART",
- "generate_tensorflow_pytorch_and_flax": "PyTorch",
- "is_encoder_decoder_model": "True"
-}
diff --git a/templates/adding_a_new_model/tests/standalone.json b/templates/adding_a_new_model/tests/standalone.json
deleted file mode 100644
index 9b6b2a11829ea8..00000000000000
--- a/templates/adding_a_new_model/tests/standalone.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "TemplateBI",
- "uppercase_modelname": "TEMPLATE_BI",
- "lowercase_modelname": "template_bi",
- "camelcase_modelname": "TemplateBi",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "bi-brand-new-bert-base-cased",
- "tokenizer_type": "Standalone",
- "generate_tensorflow_pytorch_and_flax": "PyTorch, TensorFlow and Flax",
- "is_encoder_decoder_model": "False"
-}
diff --git a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
deleted file mode 100644
index ea0178d4fa01fb..00000000000000
--- a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "TemplateTF",
- "uppercase_modelname": "TEMPLATE_TF",
- "lowercase_modelname": "template_tf",
- "camelcase_modelname": "TemplateTf",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "brand-new-bert-base-cased",
- "tokenizer_type": "Based on BERT",
- "generate_tensorflow_pytorch_and_flax": "TensorFlow",
- "is_encoder_decoder_model": "False"
-}
diff --git a/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json b/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
deleted file mode 100644
index a1be4266b92a2b..00000000000000
--- a/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "modelname": "NewTFENCDEC",
- "uppercase_modelname": "NEW_TF_ENC_DEC",
- "lowercase_modelname": "new_tf_enc_dec_template",
- "camelcase_modelname": "NewTFEncDec",
- "authors": "The HuggingFace Team",
- "checkpoint_identifier": "new-tf-enc-dec-base_template",
- "tokenizer_type": "Based on BART",
- "generate_tensorflow_pytorch_and_flax": "TensorFlow",
- "is_encoder_decoder_model": "True"
-}
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 215b2582305df5..eacba9ebc6f4a5 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1091,8 +1091,9 @@ def test_beam_search_low_memory(self):
)
self.assertListEqual(low_output.tolist(), high_output.tolist())
+ @parameterized.expand([("random",), ("same",)])
@is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail.
- def test_assisted_decoding_matches_greedy_search(self):
+ def test_assisted_decoding_matches_greedy_search(self, assistant_type):
# This test ensures that the assisted generation does not introduce output changes over greedy search.
# NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul
# shape differences -- and it may result in a different output. The input shape difference happens in the
@@ -1151,7 +1152,13 @@ def test_assisted_decoding_matches_greedy_search(self):
}
output_greedy = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
- assistant_model = model
+ # test with the same assistant model or a randomly initialized one
+ # in the first case all candidate tokens are accepted, in the second none is accepted
+ # the case when some are accepted and some are not is hard to reproduce, so let's hope this catches most errors :)
+ if assistant_type == "random":
+ assistant_model = model_class(config).to(torch_device).eval()
+ else:
+ assistant_model = model
assistant_model.generation_config.num_assistant_tokens = 2 # see b)
assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b)
generation_kwargs.update({"assistant_model": assistant_model})
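The new parameterization brackets assisted decoding between its two extremes: with `assistant_model` being the model itself, every drafted token matches the main model's greedy choice and is accepted, while a freshly initialized assistant almost never matches, so nearly every draft is rejected. A simplified sketch of the greedy acceptance rule this relies on (assumption-level pseudocode, not the actual candidate-validation code in `transformers`):

```python
def accepted_prefix(candidate_tokens, main_model_greedy_tokens):
    # keep drafted tokens only while they agree with the main model's
    # greedy choice at the same position; stop at the first mismatch
    n = 0
    for cand, greedy in zip(candidate_tokens, main_model_greedy_tokens):
        if cand != greedy:
            break
        n += 1
    return candidate_tokens[:n]

assert accepted_prefix([4, 8, 15], [4, 8, 15]) == [4, 8, 15]  # same model: all accepted
assert accepted_prefix([4, 8, 15], [4, 9, 3]) == [4]          # random assistant: diverges early
```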
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 0e1fe54e355583..4f755d816014aa 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -30,6 +30,7 @@
get_tests_dir,
nested_simplify,
require_jinja,
+ require_read_token,
require_sentencepiece,
require_tokenizers,
require_torch,
@@ -136,11 +137,12 @@ def test_special_tokens_initialization(self):
self.assertTrue(special_token_id in cr_output)
@slow
+ @require_read_token
def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[2, 158434, 591, 84193, 3836, 685, 6599, 31223, 235290, 140247, 578, 6599, 31223, 235290, 145139, 235290, 3491, 235275, 6572, 3311, 235290, 38197, 109959, 591, 25894, 235269, 162174, 235290, 235284, 235269, 1791, 6362, 12481, 235269, 1576, 18622, 235269, 2900, 1136, 86684, 235269, 29092, 4632, 16994, 604, 13146, 14944, 40371, 591, 19700, 235327, 235275, 578, 13146, 14944, 25511, 591, 235300, 12474, 235275, 675, 1163, 235248, 235304, 235284, 235340, 229903, 5377, 575, 235248, 235274, 235276, 235276, 235340, 17044, 578, 5271, 1061, 118345, 1865, 125247, 235269, 8745, 111226, 578, 176888, 235265], [2, 25894, 603, 6869, 577, 953, 235290, 8297, 5271, 209099, 41642, 774, 748, 78253, 2793, 731, 51506, 34346, 611, 2145, 2731, 578, 1833, 4807, 575, 832, 16630, 235265], [2, 651, 4320, 8426, 25341, 36271, 1163, 573, 27894, 5929, 235265]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
self.tokenizer_integration_test_util(
expected_encoding=expected_encoding,
- model_name="hf-internal-testing/dummy-gemma",
+ model_name="google/gemma-2b",
revision="",
padding=False,
)
@@ -318,7 +320,13 @@ def test_integration_test_xnli(self):
encoded1 = pyth_tokenizer.encode(string)
encoded2 = rust_tokenizer.encode(string)
- self.assertEqual(encoded1, encoded2)
+ self.assertEqual(
+ encoded1,
+ encoded2,
+ msg="Hint: the following tokenization diff were obtained for slow vs fast:\n "
+ f"elements in slow: {set(pyth_tokenizer.tokenize(string))-set(rust_tokenizer.tokenize(string))} \nvs\n "
+ f"elements in fast: {set(rust_tokenizer.tokenize(string))-set(pyth_tokenizer.tokenize(string))} \n\n{string}",
+ )
decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
decoded2 = rust_tokenizer.decode(encoded1, skip_special_tokens=True)
@@ -332,7 +340,7 @@ def test_integration_test_xnli(self):
encoded1 = pyth_tokenizer.encode(string)
encoded2 = rust_tokenizer.encode(string)
- self.assertEqual(encoded1, encoded2)
+ self.assertEqual(encoded1, encoded2, msg=f"failed on {string}")
decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 5a0bcea48af17a..84bd6d7a9d9b8a 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -543,8 +543,15 @@ def test_integration_test_xnli(self):
def test_special_token_special_word(self):
# the word inform should be split as ['in', 'form']
- tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
+ tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)
+
+ example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey.       .")
+ self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+
+ # Make sure dummy space is added if it is indeed the first word
+ example_inputs = tokenizer.tokenize("inform<s>. Hey.       .")
+ self.assertEqual(example_inputs, ["▁inform", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
out1 = tokenizer.decode(
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
)
@@ -553,12 +560,12 @@ def test_special_token_special_word(self):
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
)
# decoding strips the added prefix space.
- self.assertEqual(out2, " <REPR_END>inform")
+ self.assertEqual(out2, "<REPR_END>inform")
input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
- self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should
+ self.assertEqual(input_ids, [32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should
out2 = tokenizer.decode(
- tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False
+ tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False
)
# TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
self.assertEqual(out2, "inform")
@@ -575,11 +582,11 @@ def test_special_token_special_word(self):
# Let's make sure that if there are any spaces, we don't remove them!
input_ids = tokenizer.encode(" Hello how", add_special_tokens=False)
- self.assertEqual(input_ids, [259, 1, 15043, 1, 920])
+ self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False)
- self.assertEqual(tokens, ["ââ", "", "âHello", "", "âhow"])
+ self.assertEqual(tokens, ["â", "", "âHello", "", "âhow"])
decoded_tokens = tokenizer.decode(input_ids)
- self.assertEqual(decoded_tokens, " Hello how")
+ self.assertEqual(decoded_tokens, " Hello how")
# Let's make sure the space is preserved
input_ids = tokenizer.encode("hello", add_special_tokens=True)
@@ -594,6 +601,63 @@ def test_special_token_special_word(self):
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "hello")
+ def test_no_prefix_space(self):
+ tokenizer = LlamaTokenizerFast.from_pretrained(
+ "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
+ )
+ tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)
+
+ example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey.       .")
+ self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+
+ # Make sure no dummy space is added, since add_prefix_space=False
+ example_inputs = tokenizer.tokenize("inform<s>. Hey.       .")
+ self.assertEqual(example_inputs, ["in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+ out1 = tokenizer.decode(
+ tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
+ )
+ self.assertEqual(out1, "<REPR_END>inform")
+ out2 = tokenizer.decode(
+ tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
+ )
+ # decoding strips the added prefix space.
+ self.assertEqual(out2, "<REPR_END>inform")
+ input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
+ self.assertEqual(input_ids, [32000, 262, 689]) # no spiece underline ('▁') is prepended, since add_prefix_space=False
+
+ out2 = tokenizer.decode(
+ tokenizer.encode(" <s> inform", add_special_tokens=False), spaces_between_special_tokens=False
+ )
+ self.assertEqual(out2, "<s>inform")
+
+ input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False)
+ self.assertEqual(input_ids, [1, 15043, 1, 3525])
+ tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False)
+ self.assertEqual(tokens, ["", "âHello", "", "how"])
+ decoded_tokens = tokenizer.decode(input_ids)
+ self.assertEqual(decoded_tokens, " Hellohow")
+
+ # Let's make sure that if there are any spaces, we don't remove them!
+ input_ids = tokenizer.encode(" Hello how", add_special_tokens=False)
+ self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
+ tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False)
+ self.assertEqual(tokens, ["â", "", "âHello", "", "âhow"])
+ decoded_tokens = tokenizer.decode(input_ids)
+ self.assertEqual(decoded_tokens, " Hello how")
+
+ # Let's make sure the space is preserved
+ input_ids = tokenizer.encode("hello", add_special_tokens=True)
+ self.assertEqual(input_ids, [1, 12199])
+ tokens = tokenizer.tokenize("hello")
+ self.assertEqual(tokens, ["hello"])
+ decoded_tokens = tokenizer.decode(input_ids)
+ self.assertEqual(decoded_tokens, "hello")
+
+ input_ids = tokenizer.encode("hello", add_special_tokens=False)
+ self.assertEqual(input_ids, [12199])
+ decoded_tokens = tokenizer.decode(input_ids)
+ self.assertEqual(decoded_tokens, "hello")
+
def test_some_edge_cases(self):
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
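For orientation, a sketch of the behavior the rewritten `test_special_token_special_word` and the new `test_no_prefix_space` pin down (assumes network access to the `huggyllama/llama-7b` checkpoint; the expected outputs follow the assertions above):

```python
from transformers import LlamaTokenizerFast

# Default: a dummy prefix space is added, so the first word carries "▁".
tok = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", legacy=False, from_slow=True
)
print(tok.tokenize("inform"))  # ["▁inform"]

# With add_prefix_space=False the first word is split with no "▁" marker.
tok_no_prefix = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
)
print(tok_no_prefix.tokenize("inform"))  # ["in", "form"]
```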
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index ce432e0599d73e..b4c57e7ba012da 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -591,14 +591,6 @@ def test_tokenizer_integration(self):
fast_tokenizer.add_tokens("<image>", True)
prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
- # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens.
- # https://github.com/huggingface/transformers/pull/28881 is the fix for this.
- self.assertEqual(
- slow_tokenizer.tokenize(prompt),
- ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n']
- ) # fmt: skip
-
- self.assertEqual(
- fast_tokenizer.tokenize(prompt),
- ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '<image>', '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n']
- ) # fmt: skip
+ EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip
+ self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
+ self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
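The refactor above encodes the post-#28881 invariant, slow == fast, as a single shared expectation. The same pattern in isolation (a sketch; the tokenizer arguments are placeholders):

```python
def assert_slow_fast_parity(slow_tokenizer, fast_tokenizer, prompt, expected_tokens):
    # One expectation serves both tokenizers now that their outputs agree.
    assert slow_tokenizer.tokenize(prompt) == expected_tokens
    assert fast_tokenizer.tokenize(prompt) == expected_tokens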
diff --git a/tests/models/phi3/__init__.py b/tests/models/phi3/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py
new file mode 100644
index 00000000000000..cc0c00d4e1ea63
--- /dev/null
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -0,0 +1,474 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Testing suite for the PyTorch Phi-3 model. """
+
+
+import unittest
+
+from parameterized import parameterized
+
+from transformers import Phi3Config, is_torch_available, set_seed
+from transformers.testing_utils import (
+ require_torch,
+ slow,
+ torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+ import torch
+
+ from transformers import (
+ AutoTokenizer,
+ Phi3ForCausalLM,
+ Phi3ForSequenceClassification,
+ Phi3ForTokenClassification,
+ Phi3Model,
+ )
+
+
+class Phi3ModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=13,
+ seq_length=7,
+ is_training=True,
+ use_input_mask=True,
+ use_token_type_ids=False,
+ use_labels=True,
+ vocab_size=99,
+ hidden_size=32,
+ num_hidden_layers=2,
+ num_attention_heads=4,
+ intermediate_size=37,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=16,
+ type_sequence_label_size=2,
+ initializer_range=0.02,
+ num_labels=3,
+ num_choices=4,
+ pad_token_id=0,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.seq_length = seq_length
+ self.is_training = is_training
+ self.use_input_mask = use_input_mask
+ self.use_token_type_ids = use_token_type_ids
+ self.use_labels = use_labels
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.type_sequence_label_size = type_sequence_label_size
+ self.initializer_range = initializer_range
+ self.num_labels = num_labels
+ self.num_choices = num_choices
+ self.pad_token_id = pad_token_id
+ self.scope = scope
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs
+ def prepare_config_and_inputs(self):
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+ input_mask = None
+ if self.use_input_mask:
+ input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
+
+ token_type_ids = None
+ if self.use_token_type_ids:
+ token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+ sequence_labels = None
+ token_labels = None
+ choice_labels = None
+ if self.use_labels:
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+ config = self.get_config()
+
+ return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+ def get_config(self):
+ return Phi3Config(
+ vocab_size=self.vocab_size,
+ hidden_size=self.hidden_size,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ hidden_act=self.hidden_act,
+ hidden_dropout_prob=self.hidden_dropout_prob,
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+ max_position_embeddings=self.max_position_embeddings,
+ type_vocab_size=self.type_vocab_size,
+ is_decoder=False,
+ initializer_range=self.initializer_range,
+ pad_token_id=self.pad_token_id,
+ )
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Phi3
+ def create_and_check_model(
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+ ):
+ model = Phi3Model(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(input_ids, attention_mask=input_mask)
+ result = model(input_ids)
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Phi3
+ def create_and_check_model_as_decoder(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ token_labels,
+ choice_labels,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ ):
+ config.add_cross_attention = True
+ model = Phi3Model(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(
+ input_ids,
+ attention_mask=input_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ )
+ result = model(
+ input_ids,
+ attention_mask=input_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ )
+ result = model(input_ids, attention_mask=input_mask)
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Phi3
+ def create_and_check_for_causal_lm(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ token_labels,
+ choice_labels,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ ):
+ model = Phi3ForCausalLM(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Phi3
+ def create_and_check_decoder_model_past_large_inputs(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ token_labels,
+ choice_labels,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ ):
+ config.is_decoder = True
+ config.add_cross_attention = True
+ model = Phi3ForCausalLM(config=config)
+ model.to(torch_device)
+ model.eval()
+
+ # first forward pass
+ outputs = model(
+ input_ids,
+ attention_mask=input_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=True,
+ )
+ past_key_values = outputs.past_key_values
+
+ # create hypothetical multiple next tokens and extend to next_input_ids
+ next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+ next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+ # append to next input_ids and next attention mask
+ next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+ next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+ output_from_no_past = model(
+ next_input_ids,
+ attention_mask=next_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_hidden_states=True,
+ )["hidden_states"][0]
+ output_from_past = model(
+ next_tokens,
+ attention_mask=next_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ output_hidden_states=True,
+ )["hidden_states"][0]
+
+ # select random slice
+ random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+ output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+ output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+ self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+ # test that outputs are equal for slice
+ self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ (
+ config,
+ input_ids,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ token_labels,
+ choice_labels,
+ ) = config_and_inputs
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+ return config, inputs_dict
+
+
+@require_torch
+class Phi3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+ all_model_classes = (
+ (Phi3Model, Phi3ForCausalLM, Phi3ForSequenceClassification, Phi3ForTokenClassification)
+ if is_torch_available()
+ else ()
+ )
+ all_generative_model_classes = (Phi3ForCausalLM,) if is_torch_available() else ()
+ pipeline_model_mapping = (
+ {
+ "feature-extraction": Phi3Model,
+ "text-classification": Phi3ForSequenceClassification,
+ "text-generation": Phi3ForCausalLM,
+ "token-classification": Phi3ForTokenClassification,
+ "zero-shot": Phi3ForSequenceClassification,
+ }
+ if is_torch_available()
+ else {}
+ )
+
+ test_headmasking = False
+ test_pruning = False
+
+ # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905
+ def is_pipeline_test_to_skip(
+ self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+ ):
+ return True
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phi3
+ def setUp(self):
+ self.model_tester = Phi3ModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=Phi3Config, hidden_size=37)
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with Llama->Phi3,llama->phi3
+ def test_phi3_sequence_classification_model(self):
+ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.num_labels = 3
+ input_ids = input_dict["input_ids"]
+ attention_mask = input_ids.ne(1).to(torch_device)
+ sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+ model = Phi3ForSequenceClassification(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+ self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with Llama->Phi3,llama->phi3
+ def test_phi3_sequence_classification_model_for_single_label(self):
+ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.num_labels = 3
+ config.problem_type = "single_label_classification"
+ input_ids = input_dict["input_ids"]
+ attention_mask = input_ids.ne(1).to(torch_device)
+ sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+ model = Phi3ForSequenceClassification(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+ self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+ # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with Llama->Phi3,llama->phi3
+ def test_phi3_sequence_classification_model_for_multi_label(self):
+ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.num_labels = 3
+ config.problem_type = "multi_label_classification"
+ input_ids = input_dict["input_ids"]
+ attention_mask = input_ids.ne(1).to(torch_device)
+ sequence_labels = ids_tensor(
+ [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
+ ).to(torch.float)
+ model = Phi3ForSequenceClassification(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+ self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+ @parameterized.expand([("su",), ("yarn",)])
+ def test_model_rope_scaling_from_config(self, scaling_type):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+ short_input = ids_tensor([1, 10], config.vocab_size)
+ long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
+
+ set_seed(42) # Fixed seed at init time so the two models get the same random weights
+ original_model = Phi3Model(config)
+ original_model.to(torch_device)
+ original_model.eval()
+ original_short_output = original_model(short_input).last_hidden_state
+ original_long_output = original_model(long_input).last_hidden_state
+
+ set_seed(42) # Fixed seed at init time so the two models get the same random weights
+ n_factors = config.hidden_size // config.num_attention_heads // 2
+ config.rope_scaling = {
+ "type": scaling_type,
+ "short_factor": [5.0 for _ in range(n_factors)],
+ "long_factor": [5.0 for _ in range(n_factors)],
+ }
+ scaled_model = Phi3Model(config)
+ scaled_model.to(torch_device)
+ scaled_model.eval()
+ scaled_short_output = scaled_model(short_input).last_hidden_state
+ scaled_long_output = scaled_model(long_input).last_hidden_state
+
+ # Scaling changes the RoPE embeddings, both for the short and long outputs
+ self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
+ self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
+
+
+@slow
+@require_torch
+class Phi3IntegrationTest(unittest.TestCase):
+ def test_model_phi3_mini_4k_instruct_logits(self):
+ input_ids = {
+ "input_ids": torch.tensor(
+ [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=torch.long, device=torch_device
+ )
+ }
+
+ model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct").to(torch_device)
+ model.eval()
+
+ output = model(**input_ids).logits
+
+ EXPECTED_OUTPUT = torch.tensor([[ 0.9979, -1.9449, -2.5613, -2.2110, -0.9323, -2.2726, -3.2468, -2.0122,-1.0021, -1.2764, -1.0876, -1.2358, 3.9385, 6.2152, -0.3695, -2.3285,-1.2907, -1.8238, -1.9941, -2.2098, -0.6923, -1.6793, -1.1660, -2.0469,-0.7369, -1.4101, -1.4091, -3.1694, -1.8383, -1.1952],[ 3.0525, 1.9178, 3.7016, 0.9263, 0.3397, 1.9584, 2.1347, 0.3482, 1.3773, 0.2153, 0.2798, 0.8360, 9.0936, 11.4944, -0.3575, -0.9442,-0.1246, 1.3869, 0.9846, 1.7243, 0.9150, 1.0823, 0.4313, 1.5742, 0.2566, -0.1401, -1.3019, 0.4967, 0.6941, 0.7214]]).to(torch_device) # fmt: skip
+
+ self.assertTrue(torch.allclose(EXPECTED_OUTPUT, output[0, :2, :30], atol=1e-4, rtol=1e-4))
+
+ def test_phi3_mini_4k_instruct_generation(self):
+ model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+
+ messages = [
+ {
+ "role": "system",
+ "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
+ },
+ {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
+ ]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+ outputs = model.generate(inputs, max_new_tokens=32)
+ output_text = tokenizer.batch_decode(outputs)
+
+ EXPECTED_OUTPUT = [
+ "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Absolutely! Bananas and dragonfruits are both delicious fruits that can be combined in various ways to create tasty and nutrit"
+ ]
+
+ self.assertListEqual(output_text, EXPECTED_OUTPUT)
+
+ def test_model_phi3_mini_128k_instruct_logits(self):
+ input_ids = {
+ "input_ids": torch.tensor(
+ [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=torch.long, device=torch_device
+ )
+ }
+
+ model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-128k-instruct").to(torch_device)
+ model.eval()
+
+ output = model(**input_ids).logits
+
+ EXPECTED_OUTPUT = torch.tensor([[ 1.8478, -0.5709, -1.6792, -1.2133, -0.7809, -0.8817, -2.0969, -1.1191,-0.7731, -1.0483, -0.5961, -1.3067, 3.1325, 6.9442, -0.4803, -0.9154,-1.3085, -1.0822, -1.1433, -0.7660, -0.8531, -0.9150, -0.6179, -1.6153,-0.2239, -1.3207, -1.1187, -2.4795, -1.4733, -0.4931],[ 3.5839, 2.4722, 3.7130, 1.2032, 0.7356, 2.7777, 2.5256, 0.9157, 1.6431, 0.3533, 0.5100, 1.3512, 8.9873, 10.9815, 0.3530, 0.1473, 0.2051, 1.8553, 1.5988, 2.2268, 1.1897, 1.2829, 0.7894, 1.8895, 0.7666, 0.4122, -0.9316, 0.9936, 1.2722, 0.8263]]).to(torch_device) # fmt: skip
+
+ self.assertTrue(torch.allclose(EXPECTED_OUTPUT, output[0, :2, :30], atol=1e-4, rtol=1e-4))
+
+ def test_phi3_mini_128k_instruct_generation(self):
+ model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-128k-instruct")
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-128k-instruct")
+
+ messages = [
+ {
+ "role": "system",
+ "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
+ },
+ {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
+ ]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+ outputs = model.generate(inputs, max_new_tokens=32)
+ output_text = tokenizer.batch_decode(outputs)
+
+ EXPECTED_OUTPUT = [
+ "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious and healthy ways. Here are some ideas:\n\n1."
+ ]
+
+ self.assertListEqual(output_text, EXPECTED_OUTPUT)
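The `test_model_rope_scaling_from_config` test above exercises Phi-3's long-context RoPE variants. A standalone sketch of the configuration shape it feeds the model (tiny sizes for illustration only; the test assigns `rope_scaling` after construction in the same way):

```python
from transformers import Phi3Config

config = Phi3Config(
    vocab_size=99, hidden_size=32, num_hidden_layers=2, num_attention_heads=4
)
# One scaling factor per rotary dimension: head_dim // 2 entries each.
n_factors = config.hidden_size // config.num_attention_heads // 2
config.rope_scaling = {
    "type": "su",  # the test also covers "yarn"
    "short_factor": [5.0] * n_factors,
    "long_factor": [5.0] * n_factors,
}
```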
diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py
index d4a8a46f037851..d43d4304532431 100644
--- a/tests/models/seggpt/test_modeling_seggpt.py
+++ b/tests/models/seggpt/test_modeling_seggpt.py
@@ -16,6 +16,7 @@
import inspect
+import math
import unittest
from datasets import load_dataset
@@ -39,6 +40,7 @@
from torch import nn
from transformers import SegGptForImageSegmentation, SegGptModel
+ from transformers.models.seggpt.modeling_seggpt import SegGptLoss
if is_vision_available():
@@ -298,6 +300,22 @@ def recursive_check(batched_object, single_row_object, model_name, key):
model_row_output[key] = model_row_output[key][1:]
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
+ def test_seggpt_loss(self):
+ torch.manual_seed(100)
+ config = self.model_tester.get_config()
+
+ prompt_masks = torch.rand(1, config.num_channels, config.image_size, config.image_size)
+ label = torch.rand(1, config.num_channels, config.image_size, config.image_size)
+ pred_masks = torch.rand(1, config.num_channels, config.image_size * 2, config.image_size)
+ # seq_len x 2 because the loss concatenates prompt_masks and labels, mirroring how pred_masks stacks the prompt and input halves
+ bool_masked_pos = torch.rand(1, self.model_tester.seq_length * 2) > 0.5
+
+ loss = SegGptLoss(config)
+ loss_value = loss(prompt_masks, pred_masks, label, bool_masked_pos)
+ expected_loss_value = torch.tensor(0.3340)
+
+ self.assertTrue(torch.allclose(loss_value, expected_loss_value, atol=1e-4))
+
@slow
def test_model_from_pretrained(self):
model_name = "BAAI/seggpt-vit-large"
@@ -312,6 +330,20 @@ def prepare_img():
return images, masks
+def prepare_bool_masked_pos(config: SegGptConfig):
+ num_patches = math.prod([i // config.patch_size for i in config.image_size])
+ mask_ratio = 0.75
+ torch.manual_seed(2)
+ num_masked_patches = int(num_patches * mask_ratio)
+ shuffle_idx = torch.randperm(num_patches)
+ bool_masked_pos = torch.FloatTensor([0] * (num_patches - num_masked_patches) + [1] * num_masked_patches)[
+ shuffle_idx
+ ]
+ bool_masked_pos = bool_masked_pos.unsqueeze(0).bool()
+
+ return bool_masked_pos
+
+
@require_torch
@require_vision
class SegGptModelIntegrationTest(unittest.TestCase):
@@ -390,3 +422,30 @@ def test_few_shot_inference(self):
self.assertEqual(outputs.pred_masks.shape, expected_shape)
self.assertTrue(torch.allclose(outputs.pred_masks[0, :, 448:451, :3], expected_slice, atol=4e-4))
+
+ @slow
+ def test_one_shot_with_label(self):
+ model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large").to(torch_device)
+
+ image_processor = self.default_image_processor
+
+ images, masks = prepare_img()
+
+ input_image = images[1]
+ label = masks[1]
+ prompt_image = images[0]
+ prompt_mask = masks[0]
+
+ inputs = image_processor(
+ images=input_image, prompt_masks=prompt_mask, prompt_images=prompt_image, return_tensors="pt"
+ ).to(torch_device)
+
+ labels = image_processor(images=None, prompt_masks=label, return_tensors="pt")["prompt_masks"].to(torch_device)
+
+ bool_masked_pos = prepare_bool_masked_pos(model.config).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, labels=labels, bool_masked_pos=bool_masked_pos)
+
+ expected_loss = torch.tensor(0.0074).to(torch_device)
+ self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))
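To make the tensor shapes in `test_seggpt_loss` easier to follow: my reading of the test (not library documentation) is that SegGPT stacks the prompt and input halves vertically, so predictions cover twice the image height while ground truth arrives as two single-height tensors.

```python
import torch

C, H, W, seq_len = 3, 30, 30, 225  # illustrative values only

prompt_masks = torch.rand(1, C, H, W)        # ground truth, prompt half
labels = torch.rand(1, C, H, W)              # ground truth, input half
pred_masks = torch.rand(1, C, 2 * H, W)      # prediction spans both stacked halves
bool_masked_pos = torch.rand(1, seq_len * 2) > 0.5  # one bit per patch, both halves
```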
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index a1bc2ff172f749..f7465779b59461 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -18,6 +18,8 @@
import pathlib
import unittest
+from parameterized import parameterized
+
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available
@@ -98,7 +100,7 @@ def get_expected_values(self, image_inputs, batched=False):
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
- if width < height and width != size:
+ if width <= height and width != size:
height = int(size * height / width)
width = size
elif height < width and height != size:
@@ -183,17 +185,32 @@ def test_equivalence_padding(self):
torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
)
- def test_resize_max_size_respected(self):
+ @parameterized.expand(
+ [
+ ((3, 100, 1500), 1333, 800),
+ ((3, 400, 400), 1333, 800),
+ ((3, 1500, 1500), 1333, 800),
+ ((3, 800, 1333), 1333, 800),
+ ((3, 1333, 800), 1333, 800),
+ ((3, 800, 800), 400, 400),
+ ]
+ )
+ def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
image_processor = self.image_processing_class(**self.image_processor_dict)
# create torch tensors as image
- image = torch.randint(0, 256, (3, 100, 1500), dtype=torch.uint8)
+ image = torch.randint(0, 256, image_size, dtype=torch.uint8)
processed_image = image_processor(
- image, size={"longest_edge": 1333, "shortest_edge": 800}, do_pad=False, return_tensors="pt"
+ image,
+ size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
+ do_pad=False,
+ return_tensors="pt",
)["pixel_values"]
- self.assertTrue(processed_image.shape[-1] <= 1333)
- self.assertTrue(processed_image.shape[-2] <= 800)
+ shape = list(processed_image.shape[-2:])
+ max_size, min_size = max(shape), min(shape)
+ self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
+ self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")
@slow
def test_call_pytorch_with_coco_detection_annotations(self):
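The `<` to `<=` fix above matters for square inputs: with a strict `<`, a square image matched neither branch and was never resized. A self-contained re-implementation of the rule, mirroring the test helper's logic shown earlier:

```python
def expected_size(height, width, size=800, max_size=1333):
    # Shrink the target if scaling the shortest edge would push the longest past max_size.
    min_original, max_original = min(height, width), max(height, width)
    if max_original / min_original * size > max_size:
        size = int(round(max_size * min_original / max_original))
    if width <= height and width != size:  # "<=" so square images are resized too
        height = int(size * height / width)
        width = size
    elif height < width and height != size:
        width = int(size * width / height)
        height = size
    return height, width

print(expected_size(800, 800, size=400, max_size=1333))  # (400, 400); with "<" it stayed (800, 800)
```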
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 37ae919a448cba..ba0bf8e6b27ebb 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -821,26 +821,26 @@ def test_model_parallelism_gpt2(self):
@require_accelerate
@mark.accelerate_tests
- @require_torch_gpu
+ @require_torch_accelerator
def test_from_pretrained_disk_offload_task_model(self):
model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-gpt2")
device_map = {
- "transformer.wte": 0,
- "transformer.wpe": 0,
+ "transformer.wte": f"{torch_device}:0",
+ "transformer.wpe": f"{torch_device}:0",
"transformer.h.0": "cpu",
"transformer.h.1": "cpu",
"transformer.h.2": "cpu",
"transformer.h.3": "disk",
"transformer.h.4": "disk",
- "transformer.ln_f": 0,
- "lm_head": 0,
+ "transformer.ln_f": f"{torch_device}:0",
+ "lm_head": f"{torch_device}:0",
}
with tempfile.TemporaryDirectory() as tmp_dir:
- inputs = torch.tensor([[1, 2, 3]]).to(0)
+ inputs = torch.tensor([[1, 2, 3]]).to(f"{torch_device}:0")
model.save_pretrained(tmp_dir)
- new_model = AutoModelForCausalLM.from_pretrained(tmp_dir).to(0)
- outputs1 = new_model.to(0)(inputs)
+ new_model = AutoModelForCausalLM.from_pretrained(tmp_dir).to(f"{torch_device}:0")
+ outputs1 = new_model.to(f"{torch_device}:0")(inputs)
offload_folder = os.path.join(tmp_dir, "offload")
new_model_with_offload = AutoModelForCausalLM.from_pretrained(
@@ -851,7 +851,6 @@ def test_from_pretrained_disk_offload_task_model(self):
self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
# With state dict temp offload
- offload_folder = os.path.join(tmp_dir, "offload")
new_model_with_offload = AutoModelForCausalLM.from_pretrained(
tmp_dir,
device_map=device_map,
@@ -859,30 +858,29 @@ def test_from_pretrained_disk_offload_task_model(self):
offload_state_dict=True,
)
outputs2 = new_model_with_offload(inputs)
-
self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
@require_accelerate
@mark.accelerate_tests
- @require_torch_gpu
+ @require_torch_accelerator
def test_from_pretrained_disk_offload_derived_to_base_model(self):
derived_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
device_map = {
- "wte": 0,
- "wpe": 0,
+ "wte": f"{torch_device}:0",
+ "wpe": f"{torch_device}:0",
"h.0": "cpu",
"h.1": "cpu",
"h.2": "cpu",
"h.3": "disk",
"h.4": "disk",
- "ln_f": 0,
+ "ln_f": f"{torch_device}:0",
}
with tempfile.TemporaryDirectory() as tmp_dir:
- inputs = torch.tensor([[1, 2, 3]]).to(0)
+ inputs = torch.tensor([[1, 2, 3]]).to(f"{torch_device}:0")
derived_model.save_pretrained(tmp_dir, use_safetensors=True)
base_model = AutoModel.from_pretrained(tmp_dir)
- outputs1 = base_model.to(0)(inputs)
+ outputs1 = base_model.to(f"{torch_device}:0")(inputs)
# with disk offload
offload_folder = os.path.join(tmp_dir, "offload")
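Context for the `0` to `f"{torch_device}:0"` substitutions: under `require_torch_accelerator`, `torch_device` resolves to whichever backend is present, so device maps must be backend-qualified rather than bare GPU indices. A sketch (the device strings are examples):

```python
torch_device = "cuda"  # stand-in for transformers.testing_utils.torch_device; may be "xpu", etc.

device_map = {
    "wte": f"{torch_device}:0",  # backend-qualified device instead of a bare index
    "h.0": "cpu",                # kept in CPU RAM
    "h.3": "disk",               # offloaded to the offload folder
}
```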
diff --git a/utils/check_copies.py b/utils/check_copies.py
index 60a2fac4c8f57d..dd5d5c77dab634 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -858,7 +858,6 @@ def check_copies(overwrite: bool = False, file: str = None):
+ diff
+ "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
)
- check_model_list_copy(overwrite=overwrite)
def check_full_copies(overwrite: bool = False):
@@ -1055,68 +1054,6 @@ def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> Tup
return "".join(lines[start_index:end_index]), start_index, end_index, lines
-def check_model_list_copy(overwrite: bool = False):
- """
- Check the model lists in the README is consistent with the ones in the other READMES and also with `index.nmd`.
-
- Args:
- overwrite (`bool`, *optional*, defaults to `False`):
- Whether or not to overwrite the copies when they don't match.
- """
- # Fix potential doc links in the README
- with open(os.path.join(REPO_PATH, "README.md"), "r", encoding="utf-8", newline="\n") as f:
- readme = f.read()
- new_readme = readme.replace("https://huggingface.co/transformers", "https://huggingface.co/docs/transformers")
- new_readme = new_readme.replace(
- "https://huggingface.co/docs/main/transformers", "https://huggingface.co/docs/transformers/main"
- )
- if new_readme != readme:
- if overwrite:
- with open(os.path.join(REPO_PATH, "README.md"), "w", encoding="utf-8", newline="\n") as f:
- f.write(new_readme)
- else:
- raise ValueError(
- "The main README contains wrong links to the documentation of Transformers. Run `make fix-copies` to "
- "automatically fix them."
- )
-
- md_list = get_model_list(
- filename="README.md",
- start_prompt=LOCALIZED_READMES["README.md"]["start_prompt"],
- end_prompt=LOCALIZED_READMES["README.md"]["end_prompt"],
- )
-
- # Build the converted Markdown.
- converted_md_lists = []
- for filename, value in LOCALIZED_READMES.items():
- _start_prompt = value["start_prompt"]
- _end_prompt = value["end_prompt"]
- _format_model_list = value["format_model_list"]
-
- localized_md_list = get_model_list(filename, _start_prompt, _end_prompt)
- readmes_match, converted_md_list = convert_to_localized_md(md_list, localized_md_list, _format_model_list)
-
- converted_md_lists.append((filename, readmes_match, converted_md_list, _start_prompt, _end_prompt))
-
- # Compare the converted Markdowns
- for converted_md_list in converted_md_lists:
- filename, readmes_match, converted_md, _start_prompt, _end_prompt = converted_md_list
-
- if filename == "README.md":
- continue
- if overwrite:
- _, start_index, end_index, lines = _find_text_in_file(
- filename=os.path.join(REPO_PATH, filename), start_prompt=_start_prompt, end_prompt=_end_prompt
- )
- with open(os.path.join(REPO_PATH, filename), "w", encoding="utf-8", newline="\n") as f:
- f.writelines(lines[:start_index] + [converted_md] + lines[end_index:])
- elif not readmes_match:
- raise ValueError(
- f"The model list in the README changed and the list in `{filename}` has not been updated. Run "
- "`make fix-copies` to fix this."
- )
-
-
# Map a model name with the name it has in the README for the check_readme check
SPECIAL_MODEL_NAMES = {
"Bert Generation": "BERT For Sequence Generation",
@@ -1160,60 +1097,11 @@ def check_model_list_copy(overwrite: bool = False):
)
-def check_readme(overwrite: bool = False):
- """
- Check if the main README contains all the models in the library or not.
-
- Args:
- overwrite (`bool`, *optional*, defaults to `False`):
- Whether or not to add an entry for the missing models using `README_TEMPLATE`.
- """
- info = LOCALIZED_READMES["README.md"]
- models, start_index, end_index, lines = _find_text_in_file(
- os.path.join(REPO_PATH, "README.md"),
- info["start_prompt"],
- info["end_prompt"],
- )
- models_in_readme = [re.search(r"\*\*\[([^\]]*)", line).groups()[0] for line in models.strip().split("\n")]
-
- model_names_mapping = transformers_module.models.auto.configuration_auto.MODEL_NAMES_MAPPING
- absents = [
- (key, name)
- for key, name in model_names_mapping.items()
- if SPECIAL_MODEL_NAMES.get(name, name) not in models_in_readme
- ]
- # Remove exceptions
- absents = [(key, name) for key, name in absents if name not in MODELS_NOT_IN_README]
- if len(absents) > 0 and not overwrite:
- print(absents)
- raise ValueError(
- "The main README doesn't contain all models, run `make fix-copies` to fill it with the missing model(s)"
- " then complete the generated entries.\nIf the model is not supposed to be in the main README, add it to"
- " the list `MODELS_NOT_IN_README` in utils/check_copies.py.\nIf it has a different name in the repo than"
- " in the README, map the correspondence in `SPECIAL_MODEL_NAMES` in utils/check_copies.py."
- )
-
- new_models = [README_TEMPLATE.format(model_name=name, model_type=key) for key, name in absents]
-
- all_models = models.strip().split("\n") + new_models
- all_models = sorted(all_models, key=lambda x: re.search(r"\*\*\[([^\]]*)", x).groups()[0].lower())
- all_models = "\n".join(all_models) + "\n"
-
- if all_models != models:
- if overwrite:
- print("Fixing the main README.")
- with open(os.path.join(REPO_PATH, "README.md"), "w", encoding="utf-8", newline="\n") as f:
- f.writelines(lines[:start_index] + [all_models] + lines[end_index:])
- else:
- raise ValueError("The main README model list is not properly sorted. Run `make fix-copies` to fix this.")
-
-
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--file", type=str, default=None, help="A specific file to check and/or fix")
parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
args = parser.parse_args()
- check_readme(args.fix_and_overwrite)
check_copies(args.fix_and_overwrite, args.file)
check_full_copies(args.fix_and_overwrite)
diff --git a/utils/check_if_new_model_added.py b/utils/check_if_new_model_added.py
new file mode 100644
index 00000000000000..f3ae0d585a1517
--- /dev/null
+++ b/utils/check_if_new_model_added.py
@@ -0,0 +1,96 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is used to get the directory of the modeling file that is added in a pull request (i.e. a new model PR).
+
+Usage:
+
+```bash
+python utils/check_if_new_model_added.py
+```
+"""
+
+import re
+from pathlib import Path
+from typing import List
+
+from git import Repo
+
+
+PATH_TO_REPO = Path(__file__).parent.parent.resolve()
+
+
+def get_new_python_files_between_commits(base_commit: str, commits: List[str]) -> List[str]:
+ """
+ Get the list of added python files between a base commit and one or several commits.
+
+ Args:
+ base_commit (`str`):
+ The commit reference of where to compare for the diff. This is the current commit, not the branching point!
+ commits (`List[str]`):
+ The list of commits with which to compare the repo at `base_commit` (so the branching point).
+
+ Returns:
+ `List[str]`: The list of python files added between a base commit and one or several commits.
+ """
+ code_diff = []
+ for commit in commits:
+ for diff_obj in commit.diff(base_commit):
+ # We always add new python files
+ if diff_obj.change_type == "A" and diff_obj.b_path.endswith(".py"):
+ code_diff.append(diff_obj.b_path)
+
+ return code_diff
+
+
+def get_new_python_files() -> List[str]:
+ """
+ Return a list of python files that have been added between the current head and the main branch.
+
+ Returns:
+ `List[str]`: The list of python files added.
+ """
+ repo = Repo(PATH_TO_REPO)
+
+ try:
+ # For the cases where the main branch exists locally
+ main = repo.refs.main
+ except AttributeError:
+ # On GitHub Actions runners, it doesn't have local main branch
+ main = repo.remotes.origin.refs.main
+
+ print(f"main is at {main.commit}")
+ print(f"Current head is at {repo.head.commit}")
+
+ branching_commits = repo.merge_base(main, repo.head)
+ for commit in branching_commits:
+ print(f"Branching commit: {commit}")
+ return get_new_python_files_between_commits(repo.head.commit, branching_commits)
+
+
+if __name__ == "__main__":
+ new_files = get_new_python_files()
+ reg = re.compile(r"src/transformers/(models/.*)/modeling_.*\.py")
+
+ new_model = ""
+ for x in new_files:
+ find_new_model = reg.findall(x)
+ if len(find_new_model) > 0:
+ new_model = find_new_model[0]
+ # It's unlikely we have 2 new modeling files in a pull request.
+ break
+ print(new_model)
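A quick, runnable sanity check of the regular expression that drives the script:

```python
import re

reg = re.compile(r"src/transformers/(models/.*)/modeling_.*\.py")
assert reg.findall("src/transformers/models/phi3/modeling_phi3.py") == ["models/phi3"]
assert reg.findall("src/transformers/models/phi3/configuration_phi3.py") == []
```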
diff --git a/utils/check_table.py b/utils/check_table.py
index 99031f025c8562..9c9318ca857168 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -155,6 +155,7 @@ def _center_text(text: str, width: int) -> str:
"HerBERT": "BERT",
"LayoutXLM": "LayoutLMv2",
"Llama2": "LLaMA",
+ "Llama3": "LLaMA",
"MADLAD-400": "T5",
"MatCha": "Pix2Struct",
"mBART-50": "mBART",
diff --git a/utils/check_task_guides.py b/utils/check_task_guides.py
deleted file mode 100644
index b00ff1dc1a5a08..00000000000000
--- a/utils/check_task_guides.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Utility that checks the list of models in the tips in the task-specific pages of the doc is up to date and potentially
-fixes it.
-
-Use from the root of the repo with:
-
-```bash
-python utils/check_task_guides.py
-```
-
-for a check that will error in case of inconsistencies (used by `make repo-consistency`).
-
-To auto-fix issues run:
-
-```bash
-python utils/check_task_guides.py --fix_and_overwrite
-```
-
-which is used by `make fix-copies`.
-"""
-import argparse
-import os
-
-from transformers.utils import direct_transformers_import
-
-
-# All paths are set with the intent you should run this script from the root of the repo with the command
-# python utils/check_task_guides.py
-TRANSFORMERS_PATH = "src/transformers"
-PATH_TO_TASK_GUIDES = "docs/source/en/tasks"
-
-
-def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> str:
- """
- Find the text in filename between two prompts.
-
- Args:
- filename (`str`): The file to search into.
- start_prompt (`str`): A string to look for at the start of the content searched.
- end_prompt (`str`): A string that will mark the end of the content to look for.
-
- Returns:
- `str`: The content between the prompts.
- """
- with open(filename, "r", encoding="utf-8", newline="\n") as f:
- lines = f.readlines()
- # Find the start prompt.
- start_index = 0
- while not lines[start_index].startswith(start_prompt):
- start_index += 1
- start_index += 1
-
- # Now go until the end prompt.
- end_index = start_index
- while not lines[end_index].startswith(end_prompt):
- end_index += 1
- end_index -= 1
-
- while len(lines[start_index]) <= 1:
- start_index += 1
- while len(lines[end_index]) <= 1:
- end_index -= 1
- end_index += 1
- return "".join(lines[start_index:end_index]), start_index, end_index, lines
-
-
-# This is to make sure the transformers module imported is the one in the repo.
-transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
-
-# Map between a task guide and the corresponding auto class.
-TASK_GUIDE_TO_MODELS = {
- "asr.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_CTC_MAPPING_NAMES,
- "audio_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
- "language_modeling.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
- "image_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
- "masked_language_modeling.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_MASKED_LM_MAPPING_NAMES,
- "multiple_choice.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES,
- "object_detection.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
- "question_answering.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES,
- "semantic_segmentation.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
- "sequence_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
- "summarization.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
- "token_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
- "translation.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
- "video_classification.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
- "document_question_answering.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
- "monocular_depth_estimation.md": transformers_module.models.auto.modeling_auto.MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES,
-}
-
-# This list contains model types used in some task guides that are not in `CONFIG_MAPPING_NAMES` (therefore not in any
-# `MODEL_MAPPING_NAMES` or any `MODEL_FOR_XXX_MAPPING_NAMES`).
-SPECIAL_TASK_GUIDE_TO_MODEL_TYPES = {
- "summarization.md": ("nllb",),
- "translation.md": ("nllb",),
-}
-
-
-def get_model_list_for_task(task_guide: str) -> str:
- """
- Return the list of models supporting a given task.
-
- Args:
- task_guide (`str`): The name of the task guide to check.
-
- Returns:
- `str`: The list of models supporting this task, as links to their respective doc pages separated by commas.
- """
- model_maping_names = TASK_GUIDE_TO_MODELS[task_guide]
- special_model_types = SPECIAL_TASK_GUIDE_TO_MODEL_TYPES.get(task_guide, set())
- model_names = {
- code: name
- for code, name in transformers_module.MODEL_NAMES_MAPPING.items()
- if (code in model_maping_names or code in special_model_types)
- }
- return ", ".join([f"[{name}](../model_doc/{code})" for code, name in model_names.items()]) + "\n"
-
-
-def check_model_list_for_task(task_guide: str, overwrite: bool = False):
- """
- For a given task guide, checks the model list in the generated tip for consistency with the state of the lib and
- updates it if needed.
-
- Args:
- task_guide (`str`):
- The name of the task guide to check.
- overwrite (`bool`, *optional*, defaults to `False`):
- Whether or not to overwrite the table when it's not up to date.
- """
- current_list, start_index, end_index, lines = _find_text_in_file(
- filename=os.path.join(PATH_TO_TASK_GUIDES, task_guide),
- start_prompt="",
- end_prompt="",
- )
-
- new_list = get_model_list_for_task(task_guide)
-
- if current_list != new_list:
- if overwrite:
- with open(os.path.join(PATH_TO_TASK_GUIDES, task_guide), "w", encoding="utf-8", newline="\n") as f:
- f.writelines(lines[:start_index] + [new_list] + lines[end_index:])
- else:
- raise ValueError(
- f"The list of models that can be used in the {task_guide} guide needs an update. Run `make fix-copies`"
- " to fix this."
- )
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
- args = parser.parse_args()
-
- for task_guide in TASK_GUIDE_TO_MODELS.keys():
- check_model_list_for_task(task_guide, args.fix_and_overwrite)
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 1869836909e696..25de38efe5db6e 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -2,7 +2,6 @@ docs/source/en/_config.py
docs/source/en/accelerate.md
docs/source/en/add_new_model.md
docs/source/en/add_new_pipeline.md
-docs/source/en/add_tensorflow_model.md
docs/source/en/attention.md
docs/source/en/benchmarks.md
docs/source/en/bertology.md
@@ -336,7 +335,6 @@ src/transformers/benchmark/benchmark_args_tf.py
src/transformers/benchmark/benchmark_args_utils.py
src/transformers/benchmark/benchmark_tf.py
src/transformers/benchmark/benchmark_utils.py
-src/transformers/commands/add_new_model.py
src/transformers/commands/add_new_model_like.py
src/transformers/commands/convert.py
src/transformers/commands/download.py
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 158e01942b81fa..ba082b046fce18 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -992,13 +992,13 @@ def prepare_reports(title, header, reports, to_truncate=True):
"job_link": {},
}
for model in models
- if f"run_all_tests_gpu_{model}_test_reports" in available_artifacts
+ if f"run_models_gpu_{model}_test_reports" in available_artifacts
}
unclassified_model_failures = []
for model in model_results.keys():
- for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
+ for artifact_path in available_artifacts[f"run_models_gpu_{model}_test_reports"].paths:
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
if "stats" in artifact:
# Link to the GitHub Action job
@@ -1052,10 +1052,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
# Additional runs
additional_files = {
- "PyTorch pipelines": "run_tests_torch_pipeline_gpu",
- "TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
- "Examples directory": "run_examples_gpu",
- "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
+ "PyTorch pipelines": "run_pipelines_torch_gpu_test_reports",
+ "TensorFlow pipelines": "run_pipelines_tf_gpu_test_reports",
+ "Examples directory": "run_examples_gpu_test_reports",
+ "Torch CUDA extension tests": "run_torch_cuda_extensions_gpu_test_reports",
}
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
@@ -1075,7 +1075,7 @@ def prepare_reports(title, header, reports, to_truncate=True):
"run_pipelines_torch_gpu": "PyTorch pipelines",
"run_pipelines_tf_gpu": "TensorFlow pipelines",
"run_examples_gpu": "Examples directory",
- "run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
+ "run_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
}
# Remove some entries in `additional_files` if they are not concerned.
@@ -1133,10 +1133,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
)
# Let's only check the warning for the model testing job. Currently, the job `run_extract_warnings` is only run
- # when `inputs.job` (in the workflow file) is `run_tests_gpu`. The reason is: otherwise we need to save several
+ # when `inputs.job` (in the workflow file) is `run_models_gpu`. The reason is: otherwise we need to save several
# artifacts with different names which complicates the logic for an insignificant part of the CI workflow reporting.
selected_warnings = []
- if job_name == "run_tests_gpu":
+ if job_name == "run_models_gpu":
if "warnings_in_ci" in available_artifacts:
directory = available_artifacts["warnings_in_ci"].paths[0]["path"]
with open(os.path.join(directory, "selected_warnings.json")) as fp:
@@ -1147,7 +1147,7 @@ def prepare_reports(title, header, reports, to_truncate=True):
# Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as
# results.
- if job_name == "run_tests_gpu":
+ if job_name == "run_models_gpu":
with open("prev_ci_results/model_results.json", "w", encoding="UTF-8") as fp:
json.dump(model_results, fp, indent=4, ensure_ascii=False)
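The renames in this file follow a single convention: report artifacts are now named after the producing job plus a `_test_reports` suffix. For instance (an illustrative reconstruction, not code from the file):

```python
model = "bert"
old_artifact = f"run_all_tests_gpu_{model}_test_reports"
new_artifact = f"run_models_gpu_{model}_test_reports"  # matches the renamed job
```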
diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py
index 11bc57e618a7e4..1687eeaa25f32f 100644
--- a/utils/notification_service_quantization.py
+++ b/utils/notification_service_quantization.py
@@ -200,7 +200,7 @@ def post_reply(self):
"job_link": {},
}
for quant in quantization_matrix
- if f"run_tests_quantization_torch_gpu_{quant}" in available_artifacts
+ if f"run_quantization_torch_gpu_{ quant }_test_reports" in available_artifacts
}
github_actions_jobs = get_jobs(
@@ -217,7 +217,7 @@ def post_reply(self):
break
for quant in quantization_results.keys():
- for artifact_path in available_artifacts[f"run_tests_quantization_torch_gpu_{quant}"].paths:
+ for artifact_path in available_artifacts[f"run_quantization_torch_gpu_{ quant }_test_reports"].paths:
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
if "stats" in artifact:
# Link to the GitHub Action job
diff --git a/utils/split_model_tests.py b/utils/split_model_tests.py
index fc8800ffcf1c48..e5083aaeb46fa5 100644
--- a/utils/split_model_tests.py
+++ b/utils/split_model_tests.py
@@ -18,7 +18,7 @@
to split the list of jobs to run into multiple slices each containing a smaller number of jobs. This way, we can bypass
the maximum of 256 jobs in a matrix.
-See the `setup` and `run_tests_gpu` jobs defined in the workflow file `.github/workflows/self-scheduled.yml` for more
+See the `setup` and `run_models_gpu` jobs defined in the workflow file `.github/workflows/self-scheduled.yml` for more
details.
Usage: