diff --git a/.github/workflows/delete-doc-comment-trigger.yml b/.github/workflows/delete-doc-comment-trigger.yml
deleted file mode 100644
index 5e39e2539..000000000
--- a/.github/workflows/delete-doc-comment-trigger.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: Delete doc comment trigger
-
-on:
- pull_request:
- types: [ closed ]
-
-
-jobs:
- delete:
- uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
- with:
- pr_number: ${{ github.event.number }}
\ No newline at end of file
diff --git a/.github/workflows/delete-doc-comment.yml b/.github/workflows/delete-doc-comment.yml
deleted file mode 100644
index 8604019d7..000000000
--- a/.github/workflows/delete-doc-comment.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-name: Delete doc comment
-
-on:
- workflow_run:
- workflows: ["Delete doc comment trigger"]
- types:
- - completed
-
-
-jobs:
- delete:
- uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
- secrets:
- comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file
diff --git a/README.md b/README.md
index f15b73826..2ad41220e 100644
--- a/README.md
+++ b/README.md
@@ -15,10 +15,13 @@
-
+
+
+
+
-
+
@@ -98,7 +101,7 @@ npm i @xenova/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.8.0';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.10.0';
</script>
```
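Once imported from the CDN, the library is used the same way as the npm package. A minimal usage sketch (the default model resolved by the pipeline and the output shown are assumptions, not taken from this diff):

```js
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.10.0';

// Create a sentiment-analysis pipeline; the model is downloaded and cached on first use
const classifier = await pipeline('sentiment-analysis');

// Inference runs entirely in the browser
const output = await classifier('I love transformers!');
// e.g. [{ label: 'POSITIVE', score: 0.99... }]
```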
@@ -130,7 +133,7 @@ Want to jump straight in? Get started with one of our sample applications/templa
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.8.0/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.10.0/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
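The "Settings" section this hunk leads into configures the library's `env` object. As a hedged illustration of such customization (the paths below are placeholders, not values from this diff), self-hosting both models and WASM binaries looks roughly like:

```js
import { env } from '@xenova/transformers';

// Serve models from your own server instead of the Hugging Face Hub
env.allowRemoteModels = false;
env.localModelPath = '/path/to/models/';             // placeholder path

// Point onnxruntime-web at self-hosted WASM binaries
env.backends.onnx.wasm.wasmPaths = '/path/to/dist/'; // placeholder path
```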
@@ -211,7 +214,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
| Task | ID | Description | Supported? |
|--------------------------|----|-------------|------------|
-| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | ❌ |
+| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DepthEstimationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=depth-estimation&library=transformers.js) |
| [Image Classification](https://huggingface.co/tasks/image-classification) | `image-classification` | Assigning a label or class to an entire image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageClassificationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers.js) |
| [Image Segmentation](https://huggingface.co/tasks/image-segmentation) | `image-segmentation` | Divides an image into segments where each pixel is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageSegmentationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-segmentation&library=transformers.js) |
| [Image-to-Image](https://huggingface.co/tasks/image-to-image) | `image-to-image` | Transforming a source image to match the characteristics of a target image or a target image domain. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToImagePipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-to-image&library=transformers.js) |
@@ -247,7 +250,9 @@ You can refine your search by selecting the task you're interested in (e.g., [te
| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) |
| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | ❌ |
| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | ❌ |
+| [Zero-Shot Audio Classification](https://huggingface.co/learn/audio-course/chapter4/classification_models#zero-shot-audio-classification) | `zero-shot-audio-classification` | Classifying audio into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotAudioClassificationPipeline)<br>[(models)](https://huggingface.co/models?other=zero-shot-audio-classification&library=transformers.js) |
| [Zero-Shot Image Classification](https://huggingface.co/tasks/zero-shot-image-classification) | `zero-shot-image-classification` | Classifying images into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotImageClassificationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&library=transformers.js) |
+| [Zero-Shot Object Detection](https://huggingface.co/tasks/zero-shot-object-detection) | `zero-shot-object-detection` | Identifying objects of classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotObjectDetectionPipeline)<br>[(models)](https://huggingface.co/models?other=zero-shot-object-detection&library=transformers.js) |
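The zero-shot object detection support added in the row above is exercised through the standard pipeline API. A sketch, assuming an OWL-ViT checkpoint converted for Transformers.js (the model ID, image URL, and labels below are illustrative):

```js
import { pipeline } from '@xenova/transformers';

// Zero-shot object detection with OWL-ViT (illustrative model ID)
const detector = await pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');

const image = 'https://example.com/photo.jpg';  // placeholder image URL
const candidates = ['human face', 'rocket', 'helmet'];
const output = await detector(image, candidates);
// [{ score, label, box: { xmin, ymin, xmax, ymax } }, ...]
```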
#### Reinforcement Learning
@@ -261,6 +266,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
### Models
1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
@@ -268,17 +274,22 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
+1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
+1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
+1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
@@ -300,7 +311,9 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
+1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
+1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
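Several of the model entries added above (AST, CLAP, ConvNeXT, DPT, GLPN, Nougat, OWL-ViT) back the new pipelines in this release. For example, audio classification with an Audio Spectrogram Transformer checkpoint might look like this (the converted model ID and audio URL are assumptions, not taken from this diff):

```js
import { pipeline } from '@xenova/transformers';

// Audio classification with an AST checkpoint (illustrative model ID)
const classifier = await pipeline('audio-classification', 'Xenova/ast-finetuned-audioset-10-10-0.4593');

const output = await classifier('https://example.com/sound.wav'); // placeholder audio URL
// [{ label: 'Speech', score: 0.9... }, ...]
```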
diff --git a/docs/scripts/build_readme.py b/docs/scripts/build_readme.py
index e3beaca86..82c51dad7 100644
--- a/docs/scripts/build_readme.py
+++ b/docs/scripts/build_readme.py
@@ -17,10 +17,13 @@
-
+
+
+
+
-
+
diff --git a/docs/snippets/2_installation.snippet b/docs/snippets/2_installation.snippet
index f06edaa68..8d5e433d2 100644
--- a/docs/snippets/2_installation.snippet
+++ b/docs/snippets/2_installation.snippet
@@ -7,6 +7,6 @@ npm i @xenova/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.8.0';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.10.0';
</script>
```
diff --git a/docs/snippets/4_custom-usage.snippet b/docs/snippets/4_custom-usage.snippet
index d9522edc2..3367b2685 100644
--- a/docs/snippets/4_custom-usage.snippet
+++ b/docs/snippets/4_custom-usage.snippet
@@ -1,6 +1,6 @@
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.8.0/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.10.0/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
diff --git a/docs/snippets/5_supported-tasks.snippet b/docs/snippets/5_supported-tasks.snippet
index dee075808..838026092 100644
--- a/docs/snippets/5_supported-tasks.snippet
+++ b/docs/snippets/5_supported-tasks.snippet
@@ -22,7 +22,7 @@
| Task | ID | Description | Supported? |
|--------------------------|----|-------------|------------|
-| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | ❌ |
+| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DepthEstimationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=depth-estimation&library=transformers.js) |
| [Image Classification](https://huggingface.co/tasks/image-classification) | `image-classification` | Assigning a label or class to an entire image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageClassificationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers.js) |
| [Image Segmentation](https://huggingface.co/tasks/image-segmentation) | `image-segmentation` | Divides an image into segments where each pixel is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageSegmentationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-segmentation&library=transformers.js) |
| [Image-to-Image](https://huggingface.co/tasks/image-to-image) | `image-to-image` | Transforming a source image to match the characteristics of a target image or a target image domain. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToImagePipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-to-image&library=transformers.js) |
@@ -58,7 +58,9 @@
| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) |
| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | ❌ |
| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | ❌ |
+| [Zero-Shot Audio Classification](https://huggingface.co/learn/audio-course/chapter4/classification_models#zero-shot-audio-classification) | `zero-shot-audio-classification` | Classifying audio into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotAudioClassificationPipeline)<br>[(models)](https://huggingface.co/models?other=zero-shot-audio-classification&library=transformers.js) |
| [Zero-Shot Image Classification](https://huggingface.co/tasks/zero-shot-image-classification) | `zero-shot-image-classification` | Classifying images into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotImageClassificationPipeline)<br>[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&library=transformers.js) |
+| [Zero-Shot Object Detection](https://huggingface.co/tasks/zero-shot-object-detection) | `zero-shot-object-detection` | Identifying objects of classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotObjectDetectionPipeline)<br>[(models)](https://huggingface.co/models?other=zero-shot-object-detection&library=transformers.js) |
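Similarly, the zero-shot audio classification row added above maps to the new `ZeroShotAudioClassificationPipeline`. A sketch, assuming a converted CLAP checkpoint (model ID, audio URL, and labels are illustrative):

```js
import { pipeline } from '@xenova/transformers';

// Zero-shot audio classification with CLAP (illustrative model ID)
const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');

const audio = 'https://example.com/sound.wav';  // placeholder audio URL
const labels = ['dog barking', 'vacuum cleaner'];
const output = await classifier(audio, labels);
// [{ score, label }, ...]
```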
#### Reinforcement Learning
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 42c12bd2a..4dfc00e2d 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -2,6 +2,7 @@
### Models
1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
@@ -9,17 +10,22 @@
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
+1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
+1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
+1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
@@ -41,7 +47,9 @@
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
+1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
+1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
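The ConvNeXT/ConvNeXTV2 entries added above are usable through the existing image-classification pipeline. A sketch (the model ID and image URL are illustrative):

```js
import { pipeline } from '@xenova/transformers';

// Image classification with ConvNeXt (illustrative model ID)
const classifier = await pipeline('image-classification', 'Xenova/convnext-tiny-224');

const output = await classifier('https://example.com/cat.jpg'); // placeholder image URL
// [{ label: 'tabby, tabby cat', score: 0.8... }]
```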
diff --git a/examples/next-client/package-lock.json b/examples/next-client/package-lock.json
index ea29ca18c..309387dca 100644
--- a/examples/next-client/package-lock.json
+++ b/examples/next-client/package-lock.json
@@ -4019,9 +4019,9 @@
}
},
"node_modules/sharp": {
- "version": "0.32.4",
- "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz",
- "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==",
+ "version": "0.32.6",
+ "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+ "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
diff --git a/examples/next-server/package-lock.json b/examples/next-server/package-lock.json
index 25f40f30b..e7861f920 100644
--- a/examples/next-server/package-lock.json
+++ b/examples/next-server/package-lock.json
@@ -4019,9 +4019,9 @@
}
},
"node_modules/sharp": {
- "version": "0.32.4",
- "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz",
- "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==",
+ "version": "0.32.6",
+ "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+ "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
diff --git a/examples/semantic-image-search-client/package-lock.json b/examples/semantic-image-search-client/package-lock.json
index c99b280f2..7c06d25f3 100644
--- a/examples/semantic-image-search-client/package-lock.json
+++ b/examples/semantic-image-search-client/package-lock.json
@@ -4073,9 +4073,9 @@
}
},
"node_modules/sharp": {
- "version": "0.32.4",
- "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz",
- "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==",
+ "version": "0.32.6",
+ "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+ "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
diff --git a/examples/semantic-image-search/package-lock.json b/examples/semantic-image-search/package-lock.json
index f714d8b13..fbc41c984 100644
--- a/examples/semantic-image-search/package-lock.json
+++ b/examples/semantic-image-search/package-lock.json
@@ -4256,9 +4256,9 @@
}
},
"node_modules/sharp": {
- "version": "0.32.4",
- "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz",
- "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==",
+ "version": "0.32.6",
+ "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+ "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
diff --git a/package-lock.json b/package-lock.json
index 3c86aafe7..c2bdd41bf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
{
"name": "@xenova/transformers",
- "version": "2.8.0",
+ "version": "2.10.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@xenova/transformers",
- "version": "2.8.0",
+ "version": "2.10.0",
"license": "Apache-2.0",
"dependencies": {
"onnxruntime-web": "1.14.0",
@@ -2017,6 +2017,11 @@
"integrity": "sha512-hNfzcOV8W4NdualtqBFPyVO+54DSJuZGY9qT4pRroB6S9e3iiido2ISIC5h9R2sPJ8H3FHCIiEnsv1lPXO3KtQ==",
"dev": true
},
+ "node_modules/b4a": {
+ "version": "1.6.4",
+ "resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.4.tgz",
+ "integrity": "sha512-fpWrvyVHEKyeEvbKZTVOeZF3VSKKWtJxFIxX/jaVPf+cLbGUSitjb49pHLqPV2BUNNZ0LcoeEGfE/YCpyDYHIw=="
+ },
"node_modules/babel-jest": {
"version": "29.6.1",
"resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-29.6.1.tgz",
@@ -3014,9 +3019,9 @@
}
},
"node_modules/detect-libc": {
- "version": "2.0.1",
- "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.1.tgz",
- "integrity": "sha512-463v3ZeIrcWtdgIg6vI6XUncguvr2TnGl4SzDXinkt9mSLpBJKXT3mW6xT3VQdDN11+WVs29pgvivTc4Lp8v+w==",
+ "version": "2.0.2",
+ "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.2.tgz",
+ "integrity": "sha512-UX6sGumvvqSaXgdKGUsgZWqcUyIXZ/vZTrlRT/iobiKhGL0zL4d3osHj3uqllWJK+i+sixDS/3COVEOFbupFyw==",
"engines": {
"node": ">=8"
}
@@ -3415,6 +3420,11 @@
"integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
"dev": true
},
+ "node_modules/fast-fifo": {
+ "version": "1.3.2",
+ "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
+ "integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ=="
+ },
"node_modules/fast-glob": {
"version": "3.2.12",
"resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.2.12.tgz",
@@ -5616,9 +5626,9 @@
}
},
"node_modules/node-addon-api": {
- "version": "6.0.0",
- "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.0.0.tgz",
- "integrity": "sha512-GyHvgPvUXBvAkXa0YvYnhilSB1A+FRYMpIVggKzPZqdaZfevZOuzfWzyvgzOwRLHBeo/MMswmJFsrNF4Nw1pmA=="
+ "version": "6.1.0",
+ "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
+ "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
},
"node_modules/node-forge": {
"version": "1.3.1",
@@ -6150,6 +6160,11 @@
}
]
},
+ "node_modules/queue-tick": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz",
+ "integrity": "sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag=="
+ },
"node_modules/randombytes": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz",
@@ -6689,18 +6704,18 @@
}
},
"node_modules/sharp": {
- "version": "0.32.0",
- "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.0.tgz",
- "integrity": "sha512-yLAypVcqj1toSAqRSwbs86nEzfyZVDYqjuUX8grhFpeij0DDNagKJXELS/auegDBRDg1XBtELdOGfo2X1cCpeA==",
+ "version": "0.32.6",
+ "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+ "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
- "detect-libc": "^2.0.1",
- "node-addon-api": "^6.0.0",
+ "detect-libc": "^2.0.2",
+ "node-addon-api": "^6.1.0",
"prebuild-install": "^7.1.1",
- "semver": "^7.3.8",
+ "semver": "^7.5.4",
"simple-get": "^4.0.1",
- "tar-fs": "^2.1.1",
+ "tar-fs": "^3.0.4",
"tunnel-agent": "^0.6.0"
},
"engines": {
@@ -6710,6 +6725,26 @@
"url": "https://opencollective.com/libvips"
}
},
+ "node_modules/sharp/node_modules/tar-fs": {
+ "version": "3.0.4",
+ "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.4.tgz",
+ "integrity": "sha512-5AFQU8b9qLfZCX9zp2duONhPmZv0hGYiBPJsyUdqMjzq/mqVpy/rEUSeHk1+YitmxugaptgBh5oDGU3VsAJq4w==",
+ "dependencies": {
+ "mkdirp-classic": "^0.5.2",
+ "pump": "^3.0.0",
+ "tar-stream": "^3.1.5"
+ }
+ },
+ "node_modules/sharp/node_modules/tar-stream": {
+ "version": "3.1.6",
+ "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.6.tgz",
+ "integrity": "sha512-B/UyjYwPpMBv+PaFSWAmtYjwdrlEaZQEhMIBFNC5oEG8lpiW8XjcSdmEaClj28ArfKScKHs2nshz3k2le6crsg==",
+ "dependencies": {
+ "b4a": "^1.6.4",
+ "fast-fifo": "^1.2.0",
+ "streamx": "^2.15.0"
+ }
+ },
"node_modules/shebang-command": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
@@ -7026,6 +7061,15 @@
"node": ">=0.10.0"
}
},
+ "node_modules/streamx": {
+ "version": "2.15.5",
+ "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.15.5.tgz",
+ "integrity": "sha512-9thPGMkKC2GctCzyCUjME3yR03x2xNo0GPKGkRw2UMYN+gqWa9uqpyNWhmsNCutU5zHmkUum0LsCRQTXUgUCAg==",
+ "dependencies": {
+ "fast-fifo": "^1.1.0",
+ "queue-tick": "^1.0.1"
+ }
+ },
"node_modules/string_decoder": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
diff --git a/package.json b/package.json
index ce18ca9f3..131af0308 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "@xenova/transformers",
- "version": "2.8.0",
+ "version": "2.10.0",
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
"main": "./src/transformers.js",
"types": "./types/transformers.d.ts",
@@ -10,7 +10,7 @@
"dev": "webpack serve --no-client-overlay",
"build": "webpack && npm run typegen",
"generate-tests": "python -m tests.generate_tests",
- "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
+ "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose --maxConcurrency 1",
"readme": "python ./docs/scripts/build_readme.py",
"docs-api": "node ./docs/scripts/generate.js",
"docs-preview": "doc-builder preview transformers.js ./docs/source/ --not_python_module",
diff --git a/scripts/convert.py b/scripts/convert.py
index ffc999bfa..9055397e8 100644
--- a/scripts/convert.py
+++ b/scripts/convert.py
@@ -84,7 +84,7 @@
'vision-encoder-decoder': {
'per_channel': False,
'reduce_range': False,
- }
+ },
}
MODELS_WITHOUT_TOKENIZERS = [
@@ -326,6 +326,11 @@ def main():
with open(os.path.join(output_model_folder, 'tokenizer.json'), 'w', encoding='utf-8') as fp:
json.dump(tokenizer_json, fp, indent=4)
+ elif config.model_type == 'owlvit':
+ # Override the default batch size to 1; this is needed because non-maximum suppression is performed during export.
+ # For more information, see https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
+ export_kwargs['batch_size'] = 1
+
else:
pass # TODO
@@ -348,6 +353,25 @@ def main():
device=conv_args.device,
)
+ # TODO: Enable once https://github.com/huggingface/optimum/pull/1552 is merged
+ # elif config.model_type == 'clap' and conv_args.split_modalities:
+ # # Handle special case for exporting text and audio models separately
+ # from .extra.clap import ClapTextModelWithProjectionOnnxConfig, ClapAudioModelWithProjectionOnnxConfig
+ # from transformers.models.clap import ClapTextModelWithProjection, ClapAudioModelWithProjection
+
+ # text_model = ClapTextModelWithProjection.from_pretrained(model_id)
+ # audio_model = ClapAudioModelWithProjection.from_pretrained(model_id)
+
+ # export_models(
+ # models_and_onnx_configs={
+ # "text_model": (text_model, ClapTextModelWithProjectionOnnxConfig(text_model.config)),
+ # "audio_model": (audio_model, ClapAudioModelWithProjectionOnnxConfig(audio_model.config)),
+ # },
+ # output_dir=output_model_folder,
+ # opset=conv_args.opset,
+ # device=conv_args.device,
+ # )
+
else:
main_export(**export_kwargs)
diff --git a/scripts/extra/clap.py b/scripts/extra/clap.py
new file mode 100644
index 000000000..cd71dcad5
--- /dev/null
+++ b/scripts/extra/clap.py
@@ -0,0 +1,40 @@
+# TODO: Enable once https://github.com/huggingface/optimum/pull/1552 is merged
+
+# # Support exporting vision and text models separately:
+# # Adapted from https://github.com/huggingface/optimum/issues/1186#issuecomment-1637641760
+
+# from optimum.exporters.onnx.model_configs import CLAPTextWithProjectionOnnxConfig, AudioOnnxConfig
+# from optimum.utils.normalized_config import NormalizedAudioConfig
+# from optimum.utils.input_generators import DummyAudioInputGenerator
+# from typing import Dict
+
+
+# class ClapAudioModelWithProjectionOnnxConfig(AudioOnnxConfig):
+# NORMALIZED_CONFIG_CLASS = NormalizedAudioConfig
+# DUMMY_INPUT_GENERATOR_CLASSES = (DummyAudioInputGenerator, )
+
+# @property
+# def inputs(self) -> Dict[str, Dict[int, str]]:
+# return {
+# "input_features": {0: "audio_batch_size", 1: "num_channels", 2: "height", 3: "width"}, # As described in modeling_clap.py
+# }
+
+# @property
+# def outputs(self) -> Dict[str, Dict[int, str]]:
+# return {
+# "audio_embeds": {0: "batch_size"},
+# }
+
+# class ClapTextModelWithProjectionOnnxConfig(CLAPTextWithProjectionOnnxConfig):
+# @property
+# def outputs(self) -> Dict[str, Dict[int, str]]:
+# return {
+# "text_embeds": {0: "batch_size"},
+# }
+
+# def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+# dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+# if framework == "pt":
+# import torch
+# dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int64)
+# return dummy_inputs
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 7838686d0..26a46cbbd 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -16,6 +16,15 @@
'sentence-transformers/paraphrase-albert-base-v2',
],
},
+ 'audio-spectrogram-transformer': {
+ # Audio classification
+ 'audio-classification': {
+ 'MIT/ast-finetuned-audioset-10-10-0.4593',
+ 'MIT/ast-finetuned-audioset-16-16-0.442',
+ 'MIT/ast-finetuned-speech-commands-v2',
+ 'mtg-upf/discogs-maest-30s-pw-73e-ts',
+ }
+ },
'bart': {
# Summarization
'summarization': [
@@ -175,6 +184,17 @@
'openai/clip-vit-large-patch14-336',
]
},
+ 'clap': {
+ # Zero-shot audio classification and feature extraction
+ # (with and without `--split_modalities`)
+ 'zero-shot-audio-classification': {
+ 'laion/clap-htsat-unfused',
+ # TODO add 'laion/clap-htsat-fused',
+ 'laion/larger_clap_general',
+ 'laion/larger_clap_music_and_speech',
+ # 'Xenova/tiny-random-ClapModel',
+ }
+ },
'codegen': {
# Text generation
'text-generation': [
@@ -183,6 +203,49 @@
'Salesforce/codegen-350M-nl',
],
},
+ 'convnext': {
+ # Image classification
+ 'image-classification': [
+ 'facebook/convnext-tiny-224',
+ 'facebook/convnext-small-224',
+ 'facebook/convnext-base-224',
+ 'facebook/convnext-base-224-22k',
+ 'facebook/convnext-base-224-22k-1k',
+ 'facebook/convnext-base-384',
+ 'facebook/convnext-base-384-22k-1k',
+ 'facebook/convnext-large-224',
+ 'facebook/convnext-large-224-22k',
+ 'facebook/convnext-large-224-22k-1k',
+ 'facebook/convnext-large-384',
+ 'facebook/convnext-large-384-22k-1k',
+ 'facebook/convnext-xlarge-224-22k',
+ 'facebook/convnext-xlarge-224-22k-1k',
+ 'facebook/convnext-xlarge-384-22k-1k',
+ ],
+ },
+ 'convnextv2': {
+ # Image classification
+ 'image-classification': [
+ 'facebook/convnextv2-atto-1k-224',
+ 'facebook/convnextv2-femto-1k-224',
+ 'facebook/convnextv2-pico-1k-224',
+ 'facebook/convnextv2-tiny-1k-224',
+ 'facebook/convnextv2-tiny-22k-384',
+ 'facebook/convnextv2-tiny-22k-224',
+ 'facebook/convnextv2-nano-1k-224',
+ 'facebook/convnextv2-nano-22k-384',
+ 'facebook/convnextv2-base-22k-224',
+ 'facebook/convnextv2-base-1k-224',
+ 'facebook/convnextv2-base-22k-384',
+ 'facebook/convnextv2-large-22k-224',
+ 'facebook/convnextv2-large-1k-224',
+ 'facebook/convnextv2-large-22k-384',
+ # 'facebook/convnextv2-huge-22k-512',
+ # 'facebook/convnextv2-huge-1k-224',
+ # 'facebook/convnextv2-huge-22k-384',
+ # 'facebook/convnextv2-nano-22k-224',
+ ],
+ },
'deberta': {
# Zero-shot classification
'zero-shot-classification': [
@@ -265,7 +328,7 @@
'distilbert-base-cased',
],
},
- 'donut': { # NOTE: also a `vision-encoder-decoder`
+ 'donut': { # NOTE: also a `vision-encoder-decoder`
# Image-to-text
'image-to-text': [
'naver-clova-ix/donut-base-finetuned-cord-v2',
@@ -277,6 +340,13 @@
'naver-clova-ix/donut-base-finetuned-docvqa',
],
},
+ 'dpt': {
+ # Depth estimation
+ 'depth-estimation': [
+ 'Intel/dpt-hybrid-midas',
+ 'Intel/dpt-large',
+ ],
+ },
'falcon': {
# Text generation
'text-generation': [
@@ -284,6 +354,13 @@
'fxmarty/really-tiny-falcon-testing',
]
},
+ 'glpn': {
+ # Depth estimation
+ 'depth-estimation': [
+ 'vinvino02/glpn-kitti',
+ 'vinvino02/glpn-nyu',
+ ],
+ },
'gpt_neo': {
# Text generation
'text-generation': [
@@ -454,6 +531,13 @@
'google/mt5-base',
],
},
+ 'nougat': {
+ # Image-to-text
+ 'image-to-text': [
+ 'facebook/nougat-small',
+ 'facebook/nougat-base',
+ ],
+ },
'opt': {
# Text generation
'text-generation': [
@@ -464,6 +548,15 @@
'PygmalionAI/pygmalion-350m',
]
},
+ 'owlvit': {
+ # Object detection (Zero-shot object detection)
+ # NOTE: Exported with --batch_size 1
+ 'zero-shot-object-detection': [
+ 'google/owlvit-base-patch32',
+ 'google/owlvit-base-patch16',
+ 'google/owlvit-large-patch14',
+ ],
+ },
'resnet': {
# Image classification
'image-classification': [
@@ -503,7 +596,7 @@
# 'facebook/sam-vit-large',
# 'facebook/sam-vit-huge',
# ],
-
+
'speecht5': {
# Text-to-audio/Text-to-speech
'text-to-audio': [
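The new `dpt` and `glpn` entries registered above feed the depth-estimation pipeline enabled in this release. A sketch of its use (the model ID and image URL are illustrative):

```js
import { pipeline } from '@xenova/transformers';

// Monocular depth estimation with DPT (illustrative model ID)
const estimator = await pipeline('depth-estimation', 'Xenova/dpt-hybrid-midas');

const output = await estimator('https://example.com/room.jpg'); // placeholder image URL
// { predicted_depth: Tensor, depth: RawImage } (depth is a grayscale image)
```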
diff --git a/src/backends/onnx.js b/src/backends/onnx.js
index a06beb0e3..0bee3dce7 100644
--- a/src/backends/onnx.js
+++ b/src/backends/onnx.js
@@ -21,7 +21,7 @@
import * as ONNX_NODE from 'onnxruntime-node';
import * as ONNX_WEB from 'onnxruntime-web';
-/** @type {module} The ONNX runtime module. */
+/** @type {import('onnxruntime-web')} The ONNX runtime module. */
export let ONNX;
export const executionProviders = [
diff --git a/src/env.js b/src/env.js
index a12688933..16b19aaa5 100644
--- a/src/env.js
+++ b/src/env.js
@@ -29,7 +29,7 @@ import url from 'url';
import { ONNX } from './backends/onnx.js';
const { env: onnx_env } = ONNX;
-const VERSION = '2.8.0';
+const VERSION = '2.10.0';
// Check if various APIs are available (depends on environment)
const WEB_CACHE_AVAILABLE = typeof self !== 'undefined' && 'caches' in self;
diff --git a/src/models.js b/src/models.js
index b0a82cee0..1c55d8877 100644
--- a/src/models.js
+++ b/src/models.js
@@ -42,6 +42,10 @@ import {
AutoConfig,
} from './configs.js';
+import {
+ add_token_types,
+} from './tokenizers.js';
+
import {
Callable,
isIntegralNumber,
@@ -64,6 +68,7 @@ import {
WhisperTimeStampLogitsProcessor,
NoRepeatNGramLogitsProcessor,
RepetitionPenaltyLogitsProcessor,
+ NoBadWordsLogitsProcessor,
MinLengthLogitsProcessor,
MinNewTokensLengthLogitsProcessor,
@@ -82,7 +87,9 @@ import {
import { executionProviders, ONNX } from './backends/onnx.js';
import { medianFilter } from './transformers.js';
-const { InferenceSession, Tensor: ONNXTensor } = ONNX;
+const { InferenceSession, Tensor: ONNXTensor, env } = ONNX;
+
+/** @typedef {import('onnxruntime-web').InferenceSession} InferenceSession */
//////////////////////////////////////////////////
// Model types: used internally
@@ -142,21 +149,31 @@ async function constructSession(pretrained_model_name_or_path, fileName, options
/**
* Validate model inputs
* @param {InferenceSession} session The InferenceSession object that will be run.
- * @param {Object} inputs The inputs to check.
- * @returns {Promise