From 5f572241b4dee3622dc31584ad682abf496fef2f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 6 Mar 2024 13:08:28 +0200 Subject: [PATCH] Add support for `UniSpeech` and `UniSpeechSat` models (#624) * Add support for `UniSpeech` and `UniSpeechSat` models * Remove listed support for unispeech STT models * Align unispeech w/ existing wav2vec logic --- README.md | 2 + docs/snippets/6_supported-models.snippet | 2 + scripts/convert.py | 16 +++- scripts/supported_models.py | 50 +++++++++++ src/models.js | 103 ++++++++++++++++++++++- src/pipelines.js | 2 + 6 files changed, 173 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4fcaf68f5..a5e51fe1c 100644 --- a/README.md +++ b/README.md @@ -341,6 +341,8 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. 
**[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index ed2c1bdff..eed000be3 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -76,6 +76,8 @@ 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 
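
As a rough orientation for the listing above, here is a minimal sketch (not part of the patch) of what the new support enables: extracting hidden states from a UniSpeech-family encoder in Transformers.js. The repo id is a placeholder, assuming an ONNX export of `microsoft/unispeech-sat-base` has been produced with the conversion script touched in `scripts/convert.py` below.

```js
import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers';

// Placeholder id: assumes an ONNX export of microsoft/unispeech-sat-base
// (converted with scripts/convert.py) has been published under this name.
const model_id = 'your-namespace/unispeech-sat-base';

const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModel.from_pretrained(model_id);

// UniSpeech(-SAT) uses a Wav2Vec2-style feature extractor: 16 kHz mono float audio.
const audio = await read_audio('https://example.com/sample.wav', 16000);
const inputs = await processor(audio);

const { last_hidden_state } = await model(inputs);
console.log(last_hidden_state.dims); // e.g. [1, num_frames, hidden_size]
```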
diff --git a/scripts/convert.py b/scripts/convert.py index 5b8620471..d53843983 100644 --- a/scripts/convert.py +++ b/scripts/convert.py @@ -103,6 +103,18 @@ 'per_channel': False, 'reduce_range': False, }, + 'wav2vec2': { + 'per_channel': False, + 'reduce_range': False, + }, + 'unispeech': { + 'per_channel': False, + 'reduce_range': False, + }, + 'unispeech-sat': { + 'per_channel': False, + 'reduce_range': False, + }, } MODELS_WITHOUT_TOKENIZERS = [ @@ -110,6 +122,8 @@ 'wav2vec2-bert', 'wavlm', 'hubert', + 'unispeech', + 'unispeech-sat', ] @@ -386,7 +400,7 @@ def main(): **get_main_export_kwargs(config, "automatic-speech-recognition") ) - elif config.model_type in ('wav2vec2', 'wav2vec2-bert', 'hubert'): + elif config.model_type in ('wav2vec2', 'wav2vec2-bert', 'hubert', 'unispeech' , 'unispeech-sat'): if tokenizer is not None: from .extra.wav2vec2 import generate_tokenizer_json tokenizer_json = generate_tokenizer_json(tokenizer) diff --git a/scripts/supported_models.py b/scripts/supported_models.py index 7d7a5c169..7a2fd62bf 100644 --- a/scripts/supported_models.py +++ b/scripts/supported_models.py @@ -924,6 +924,45 @@ 'microsoft/trocr-base-handwritten', ], }, + 'unispeech': { + # Feature extraction + 'feature-extraction': [ + # Requires --task feature-extraction + 'microsoft/unispeech-large-1500h-cv', + ], + # TODO: add support for + # # Automatic speech recognition + # 'automatic-speech-recognition': [ + # 'microsoft/unispeech-1350-en-353-fr-ft-1h', + # 'microsoft/unispeech-1350-en-17h-ky-ft-1h', + # 'microsoft/unispeech-1350-en-90-it-ft-1h', + # 'microsoft/unispeech-1350-en-168-es-ft-1h', + # ], + }, + 'unispeech-sat': { + # Feature extraction + 'feature-extraction': [ + # Requires --task feature-extraction + 'microsoft/unispeech-sat-base', + ], + + # Audio XVector (e.g., for speaker verification) + 'audio-xvector': [ + 'microsoft/unispeech-sat-base-plus-sv', + 'microsoft/unispeech-sat-base-sv', + 'microsoft/unispeech-sat-large-sv', + ], + + # Audio frame classification + 'audio-frame-classification': [ + 'microsoft/unispeech-sat-base-plus-sd', + ], + + # Automatic speech recognition + 'automatic-speech-recognition': [ + 'microsoft/unispeech-sat-base-100h-libri-ft', + ], + }, 'vision-encoder-decoder': { # Image-to-text 'image-to-text': [ @@ -993,6 +1032,11 @@ 'facebook/mms-lid-4017', ], + # Audio frame classification + 'audio-frame-classification': [ + 'anton-l/wav2vec2-base-superb-sd', + ], + # Automatic speech recognition 'automatic-speech-recognition': [ 'jonatasgrosman/wav2vec2-large-xlsr-53-english', @@ -1020,6 +1064,12 @@ 'microsoft/wavlm-large', ], + # Audio frame classification + 'audio-frame-classification': [ + 'anton-l/wav2vec2-base-superb-sd', + 'microsoft/wavlm-base-plus-sd', + ], + # Audio XVector (e.g., for speaker verification) 'audio-xvector': [ 'microsoft/wavlm-base-plus-sv', diff --git a/src/models.js b/src/models.js index cf7807300..ec4612052 100644 --- a/src/models.js +++ b/src/models.js @@ -4574,7 +4574,97 @@ export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { ////////////////////////////////////////////////// ////////////////////////////////////////////////// -// Wav2Vec2 models +// UniSpeech models +export class UniSpeechPreTrainedModel extends PreTrainedModel { }; + +/** + * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top. 
+ */ +export class UniSpeechModel extends UniSpeechPreTrainedModel { } + +/** + * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechForCTC extends UniSpeechPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +////////////////////////////////////////////////// + +////////////////////////////////////////////////// +// UniSpeechSat models +export class UniSpeechSatPreTrainedModel extends PreTrainedModel { }; + +/** + * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top. + */ +export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel { } + +/** + * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization. + */ +export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} +////////////////////////////////////////////////// + +////////////////////////////////////////////////// +// Wav2Vec2Bert models export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel { }; /** @@ -5289,6 +5379,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]], + ['unispeech', ['UniSpeechModel', UniSpeechModel]], + ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]], ['hubert', ['HubertModel', HubertModel]], ['wavlm', ['WavLMModel', WavLMModel]], ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], @@ -5514,6 +5606,8 @@ const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]], + ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]], + ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]], ['wavlm', ['WavLMForCTC', WavLMForCTC]], ['hubert', ['HubertForCTC', HubertForCTC]], ]); @@ -5521,6 +5615,8 @@ const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]], + ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]], + ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]], ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]], @@ -5530,6 +5626,10 @@ const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([ ['wavlm', ['WavLMForXVector', WavLMForXVector]], ]); +const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]], +]); + const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], ]); @@ -5571,6 +5671,7 @@ const MODEL_CLASS_TYPE_MAPPING = [ [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], ]; for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { diff --git a/src/pipelines.js b/src/pipelines.js index a9af251cb..25dfb5875 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -1526,6 +1526,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options return this._call_whisper(audio, kwargs) case 'wav2vec2': case 'wav2vec2-bert': + case 'unispeech': + case 'unispeech-sat': case 'hubert': return this._call_wav2vec2(audio, kwargs) default: