From 3d35f66e8b724cdee5d8dc5ade9627e32345379c Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 23 Feb 2024 01:35:59 +0100
Subject: [PATCH 1/8] Add WavLMForXVector support

---
 src/models.js | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/models.js b/src/models.js
index 46c112a0b..19e183588 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4735,6 +4735,47 @@ export class WavLMForSequenceClassification extends WavLMPreTrainedModel {
     }
 }
 
+/**
+ * WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+ *
+ * **Example:** Extract speaker embeddings with `WavLMForXVector`.
+ * ```javascript
+ * import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers';
+ *
+ * const processor = await AutoProcessor.from_pretrained('D4ve-R/wavlm-base-plus-sv');
+ * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
+ * const audio = await read_audio(url, 16000);
+ * const inputs = await processor(audio);
+
+ * const model = await AutoModel.from_pretrained('D4ve-R/wavlm-base-plus-sv', {quantized: false});
+ * const embeddings = await model(inputs);
+ * // {
+ * //   embeddings: Tensor {
+ * //     dims: [ 1, 512 ],
+ * //     type: 'float32',
+ * //     data: Float32Array(512) [-0.349443256855011, ...],
+ * //     size: 512
+ * //   },
+ * //   logits: Tensor {
+ * //     dims: [ 1, 512 ],
+ * //     type: 'float32',
+ * //     data: Float32Array(512) [0.022836603224277496, ...],
+ * //     size: 512
+ * //   }
+ * // }
+ * ```
+ */
+export class WavLMForXVector extends WavLMPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<XVectorOutput>} An object containing the model's output logits for sequence classification.
+     */
+    async _call(model_inputs) {
+        return new XVectorOutput(await super._call(model_inputs));
+    }
+}
+
 //////////////////////////////////////////////////
 // SpeechT5 models
 /**
@@ -5483,6 +5524,10 @@ const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]],
 ]);
 
+const MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES = new Map([
+    ['wavlm', ['WavLMForXVector', WavLMForXVector]],
+]);
+
 const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([
     ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]],
 ]);
@@ -5523,6 +5568,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
     [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
 ];
 
 for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
@@ -5741,6 +5787,10 @@ export class AutoModelForAudioClassification extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES];
 }
 
+export class AutoModelForSpeakerVerification extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES];
+}
+
 export class AutoModelForDocumentQuestionAnswering extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES];
 }
@@ -5793,6 +5843,22 @@ export class SequenceClassifierOutput extends ModelOutput {
     }
 }
 
+/**
+ * Base class for outputs of x-vector models.
+ */
+export class XVectorOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+     * @param {Tensor} output.embeddings The embeddings of the input sequence.
+     */
+    constructor({ logits, embeddings }) {
+        super();
+        this.logits = logits;
+        this.embeddings = embeddings;
+    }
+}
+
 /**
  * Base class for outputs of token classification models.
  */

From 6d09c47e4c22171f6c6fc37c4a2a795586f6ddb7 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 23 Feb 2024 10:52:29 +0100
Subject: [PATCH 2/8] fix model docs

---
 src/models.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/models.js b/src/models.js
index 19e183588..6c823bfe5 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4747,7 +4747,7 @@ export class WavLMForSequenceClassification extends WavLMPreTrainedModel {
  * const audio = await read_audio(url, 16000);
  * const inputs = await processor(audio);
 
- * const model = await AutoModel.from_pretrained('D4ve-R/wavlm-base-plus-sv', {quantized: false});
+ * const model = await AutoModel.from_pretrained('D4ve-R/wavlm-base-plus-sv');
  * const embeddings = await model(inputs);
  * // {
  * //   embeddings: Tensor {

From 69f5652ef51b911b9bcbeee44ca47462657e6609 Mon Sep 17 00:00:00 2001
From: Dave <69651599+D4ve-R@users.noreply.github.com>
Date: Fri, 23 Feb 2024 11:30:51 +0100
Subject: [PATCH 3/8] fix bad naming

---
 src/models.js | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/models.js b/src/models.js
index 6c823bfe5..11e51c169 100644
--- a/src/models.js
+++ b/src/models.js
@@ -5524,7 +5524,7 @@ const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]],
 ]);
 
-const MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES = new Map([
+const MODEL_FOR_XVECTOR_MAPPING_NAMES = new Map([
     ['wavlm', ['WavLMForXVector', WavLMForXVector]],
 ]);
 
@@ -5568,7 +5568,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
     [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
-    [MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
 ];
 
 for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
@@ -5787,8 +5787,8 @@ export class AutoModelForAudioClassification extends PretrainedMixin {
     static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES];
 }
 
-export class AutoModelForSpeakerVerification extends PretrainedMixin {
-    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEAKER_VERIFICATION_MAPPING_NAMES];
+export class AutoModelForXVector extends PretrainedMixin {
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_XVECTOR_MAPPING_NAMES];
 }
 
 export class AutoModelForDocumentQuestionAnswering extends PretrainedMixin {

From c3c6e01b7c37966a1d41dde9ab96e08d36eb27e3 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 28 Feb 2024 16:57:30 +0200
Subject: [PATCH 4/8] Apply suggestions from code review

---
 src/models.js | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/models.js b/src/models.js
index 11e51c169..a832bb6c1 100644
--- a/src/models.js
+++ b/src/models.js
@@ -5524,7 +5524,7 @@ const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]],
 ]);
 
-const MODEL_FOR_XVECTOR_MAPPING_NAMES = new Map([
+const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([
     ['wavlm', ['WavLMForXVector', WavLMForXVector]],
 ]);
 
@@ -5568,7 +5568,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
     [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
-    [MODEL_FOR_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
 ];
 
 for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
@@ -5788,7 +5788,7 @@ export class AutoModelForAudioClassification extends PretrainedMixin {
 }
 
 export class AutoModelForXVector extends PretrainedMixin {
-    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_XVECTOR_MAPPING_NAMES];
+    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES];
 }
 
 export class AutoModelForDocumentQuestionAnswering extends PretrainedMixin {
@@ -5844,13 +5844,13 @@ export class SequenceClassifierOutput extends ModelOutput {
 }
 
 /**
- * Base class for outputs of x-vector models.
+ * Base class for outputs of XVector models.
  */
 export class XVectorOutput extends ModelOutput {
     /**
      * @param {Object} output The output of the model.
-     * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
-     * @param {Tensor} output.embeddings The embeddings of the input sequence.
+     * @param {Tensor} output.logits Classification hidden states before AMSoftmax, of shape `(batch_size, config.xvector_output_dim)`.
+     * @param {Tensor} output.embeddings Utterance embeddings used for vector similarity-based retrieval, of shape `(batch_size, config.xvector_output_dim)`.
      */
     constructor({ logits, embeddings }) {
         super();
         this.logits = logits;
         this.embeddings = embeddings;
     }
 }

From fbfaf1482252d3fa8b76b023b6786be2ee5b61c5 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 28 Feb 2024 17:07:41 +0000
Subject: [PATCH 5/8] Update default `wavlm` quantization settings

---
 scripts/convert.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/convert.py b/scripts/convert.py
index 87ac56f99..5b8620471 100644
--- a/scripts/convert.py
+++ b/scripts/convert.py
@@ -99,6 +99,10 @@
         'per_channel': False,
         'reduce_range': False,
     },
+    'wavlm': {
+        'per_channel': False,
+        'reduce_range': False,
+    },
 }
 
 MODELS_WITHOUT_TOKENIZERS = [

From f87fc7dc04be9790c360cfc54407d835afb7e85d Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 28 Feb 2024 17:08:10 +0000
Subject: [PATCH 6/8] Update list of supported `wavlm` models

---
 scripts/supported_models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 99f3421ab..7d7a5c169 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -1019,6 +1019,12 @@
             'microsoft/wavlm-base-plus',
             'microsoft/wavlm-large',
         ],
+
+        # Audio XVector (e.g., for speaker verification)
+        'audio-xvector': [
+            'microsoft/wavlm-base-plus-sv',
+            'microsoft/wavlm-base-sv',
+        ],
     },
     'whisper': {
         # Automatic speech recognition

From b62dee327e2af73c93f3deb7426e5e260e67985c Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 28 Feb 2024 17:15:59 +0000
Subject: [PATCH 7/8] Update JSDoc

---
 src/models.js | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/models.js b/src/models.js
index a832bb6c1..cf7807300 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4742,24 +4742,26 @@ export class WavLMForSequenceClassification extends WavLMPreTrainedModel {
  * ```javascript
  * import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers';
  *
- * const processor = await AutoProcessor.from_pretrained('D4ve-R/wavlm-base-plus-sv');
+ * // Read and preprocess audio
+ * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv');
  * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
  * const audio = await read_audio(url, 16000);
  * const inputs = await processor(audio);
-
- * const model = await AutoModel.from_pretrained('D4ve-R/wavlm-base-plus-sv');
- * const embeddings = await model(inputs);
+ *
+ * // Run model with inputs
+ * const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv');
+ * const outputs = await model(inputs);
  * // {
- * //   embeddings: Tensor {
+ * //   logits: Tensor {
  * //     dims: [ 1, 512 ],
  * //     type: 'float32',
- * //     data: Float32Array(512) [-0.349443256855011, ...],
+ * //     data: Float32Array(512) [0.5847219228744507, ...],
  * //     size: 512
  * //   },
- * //   logits: Tensor {
+ * //   embeddings: Tensor {
  * //     dims: [ 1, 512 ],
  * //     type: 'float32',
- * //     data: Float32Array(512) [0.022836603224277496, ...],
+ * //     data: Float32Array(512) [-0.09079201519489288, ...],
  * //     size: 512
  * //   }
  * // }
  * ```
  */
 export class WavLMForXVector extends WavLMPreTrainedModel {
     /**
      * Calls the model on new inputs.
      * @param {Object} model_inputs The inputs to the model.
-     * @returns {Promise<XVectorOutput>} An object containing the model's output logits for sequence classification.
+     * @returns {Promise<XVectorOutput>} An object containing the model's output logits and speaker embeddings.
      */
     async _call(model_inputs) {
         return new XVectorOutput(await super._call(model_inputs));
     }

From 87e0131d64c277ced669ab70fb911d89c0451156 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 28 Feb 2024 17:16:16 +0000
Subject: [PATCH 8/8] Fix typo

---
 src/processors.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/processors.js b/src/processors.js
index 463f82551..a8b82e913 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -158,7 +158,7 @@ function post_process_object_detection(outputs, threshold = 0.5, target_sizes =
 function validate_audio_inputs(audio, feature_extractor) {
     if (!(audio instanceof Float32Array || audio instanceof Float64Array)) {
         throw new Error(
-            `${feature_extractor} expects input to be a Float32Array or a Float64Array, but got ${audio?.constructor?.name ?? typeof audio} instead.` +
+            `${feature_extractor} expects input to be a Float32Array or a Float64Array, but got ${audio?.constructor?.name ?? typeof audio} instead. ` +
             `If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`
         )
     }
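
Note: below is a minimal sketch of how the speaker embeddings introduced by this patch series could be used for speaker verification via cosine similarity. It is not part of the diffs above: the `embed` and `cosineSimilarity` helpers, the example audio URLs, and the 0.9 decision threshold are illustrative assumptions; only `AutoProcessor`, `AutoModel`, `read_audio`, and the `Xenova/wavlm-base-plus-sv` checkpoint come from the patches themselves.

```javascript
import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers';

const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv');
const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv');

// Embed one utterance into a 512-dimensional x-vector.
async function embed(url) {
    const audio = await read_audio(url, 16000);
    const inputs = await processor(audio);
    const { embeddings } = await model(inputs);
    return embeddings.data; // Float32Array of length 512
}

// Plain cosine similarity between two embedding vectors.
function cosineSimilarity(a, b) {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; ++i) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Hypothetical audio clips; substitute your own recordings.
const emb1 = await embed('https://example.com/speaker_a_1.wav');
const emb2 = await embed('https://example.com/speaker_a_2.wav');

// Same-speaker pairs score close to 1. The 0.9 threshold is only a
// starting point and should be tuned on held-out data.
const score = cosineSimilarity(emb1, emb2);
console.log(score > 0.9 ? 'Same speaker' : 'Different speakers');
```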