diff --git a/src/pipelines.js b/src/pipelines.js index 398e1f096..161094270 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -130,8 +130,9 @@ async function prepareAudios(audios, sampling_rate) { * @param {number[]} box The bounding box as a list. * @param {boolean} asInteger Whether to cast to integers. * @returns {BoundingBox} The bounding box as an object. + * @private */ -export function get_bounding_box(box, asInteger) { +function get_bounding_box(box, asInteger) { if (asInteger) { box = box.map(x => x | 0); } @@ -141,6 +142,14 @@ export function get_bounding_box(box, asInteger) { } +/** + * @callback DisposeType Disposes the item. + * @returns {Promise} A promise that resolves when the item has been disposed. + * + * @typedef {Object} Disposable + * @property {DisposeType} dispose A promise that resolves when the pipeline has been disposed. + */ + /** * The Pipeline class is the class from which all pipelines inherit. * Refer to this class for methods shared across different pipelines. @@ -151,9 +160,9 @@ export class Pipeline extends Callable { * Create a new Pipeline. * @param {Object} options An object containing the following properties: * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks. - * @param {PreTrainedModel} [options.model] The model to use. - * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer to use (if any). - * @param {Processor} [options.processor=null] The processor to use (if any). + * @param {PreTrainedModel} [options.model] The model used by the pipeline. + * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any). + * @param {Processor} [options.processor=null] The processor used by the pipeline (if any). */ constructor({ task, model, tokenizer = null, processor = null }) { super(); @@ -163,10 +172,7 @@ export class Pipeline extends Callable { this.processor = processor; } - /** - * Disposes the model. - * @returns {Promise} A promise that resolves when the model has been disposed. - */ + /** @type {DisposeType} */ async dispose() { await this.model.dispose(); } @@ -175,8 +181,8 @@ export class Pipeline extends Callable { /** * @typedef {Object} ModelTokenizerConstructorArgs * @property {string} task The task of the pipeline. Useful for specifying subtasks. - * @property {PreTrainedModel} model The model to use. - * @property {PreTrainedTokenizer} tokenizer The tokenizer to use. + * @property {PreTrainedModel} model The model used by the pipeline. + * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline. * * @typedef {ModelTokenizerConstructorArgs} TextPipelineConstructorArgs An object used to instantiate a text-based pipeline. */ @@ -184,8 +190,8 @@ export class Pipeline extends Callable { /** * @typedef {Object} ModelProcessorConstructorArgs * @property {string} task The task of the pipeline. Useful for specifying subtasks. - * @property {PreTrainedModel} model The model to use. - * @property {Processor} processor The processor to use. + * @property {PreTrainedModel} model The model used by the pipeline. + * @property {Processor} processor The processor used by the pipeline. * * @typedef {ModelProcessorConstructorArgs} AudioPipelineConstructorArgs An object used to instantiate an audio-based pipeline. * @typedef {ModelProcessorConstructorArgs} ImagePipelineConstructorArgs An object used to instantiate an image-based pipeline. @@ -195,9 +201,9 @@ export class Pipeline extends Callable { /** * @typedef {Object} ModelTokenizerProcessorConstructorArgs * @property {string} task The task of the pipeline. Useful for specifying subtasks. - * @property {PreTrainedModel} model The model to use. - * @property {PreTrainedTokenizer} tokenizer The tokenizer to use. - * @property {Processor} processor The processor to use. + * @property {PreTrainedModel} model The model used by the pipeline. + * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline. + * @property {Processor} processor The processor used by the pipeline. * * @typedef {ModelTokenizerProcessorConstructorArgs} TextAudioPipelineConstructorArgs An object used to instantiate a text- and audio-based pipeline. * @typedef {ModelTokenizerProcessorConstructorArgs} TextImagePipelineConstructorArgs An object used to instantiate a text- and image-based pipeline. @@ -216,6 +222,8 @@ export class Pipeline extends Callable { * @param {string|string[]} texts The input text(s) to be classified. * @param {TextClassificationPipelineOptions} [options] The options to use for text classification. * @returns {Promise} An array or object containing the predicted labels and scores. + * + * @typedef {TextPipelineConstructorArgs & TextClassificationPipelineCallback & Disposable} TextClassificationPipelineType */ /** @@ -255,7 +263,7 @@ export class Pipeline extends Callable { * // ] * ``` */ -export class TextClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class TextClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextClassificationPipelineType} */ (Pipeline)) { /** * Create a new TextClassificationPipeline. @@ -269,24 +277,23 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi async _call(texts, { topk = 1 } = {}) { - const self = /** @type {TextClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); // Run tokenization - const model_inputs = self.tokenizer(texts, { + const model_inputs = this.tokenizer(texts, { padding: true, truncation: true, }); // Run model - const outputs = await self.model(model_inputs) + const outputs = await this.model(model_inputs) // TODO: Use softmax tensor function const function_to_apply = - self.model.config.problem_type === 'multi_label_classification' + this.model.config.problem_type === 'multi_label_classification' ? batch => batch.sigmoid().data : batch => softmax(batch.data); // single_label_classification (default) - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; const toReturn = []; for (const batch of outputs.logits) { @@ -325,6 +332,8 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi * @param {string|string[]} texts One or several texts (or one list of texts) for token classification. * @param {TokenClassificationPipelineOptions} [options] The options to use for token classification. * @returns {Promise} The result. + * + * @typedef {TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable} TokenClassificationPipelineType */ /** @@ -356,7 +365,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi * // ] * ``` */ -export class TokenClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class TokenClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType} */ (Pipeline)) { /** * Create a new TokenClassificationPipeline. @@ -370,21 +379,20 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP async _call(texts, { ignore_labels = ['O'], } = {}) { - const self = /** @type {TokenClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(texts); // Run tokenization - const model_inputs = self.tokenizer(isBatched ? texts : [texts], { + const model_inputs = this.tokenizer(isBatched ? texts : [texts], { padding: true, truncation: true, }); // Run model - const outputs = await self.model(model_inputs) + const outputs = await this.model(model_inputs) const logits = outputs.logits; - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; const toReturn = []; for (let i = 0; i < logits.dims[0]; ++i) { @@ -404,7 +412,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP } // TODO add option to keep special tokens? - const word = self.tokenizer.decode([ids[j].item()], { skip_special_tokens: true }); + const word = this.tokenizer.decode([ids[j].item()], { skip_special_tokens: true }); if (word === '') { // Was a special token. So, we skip it. continue; @@ -444,6 +452,8 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP * @param {string|string[]} context One or several context(s) associated with the question(s) (must be used in conjunction with the `question` argument). * @param {QuestionAnsweringPipelineOptions} [options] The options to use for question answering. * @returns {Promise} An array or object containing the predicted answers and scores. + * + * @typedef {TextPipelineConstructorArgs & QuestionAnsweringPipelineCallback & Disposable} QuestionAnsweringPipelineType */ /** @@ -461,7 +471,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP * // } * ``` */ -export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineType} */ (Pipeline)) { /** * Create a new QuestionAnsweringPipeline. @@ -475,22 +485,21 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip async _call(question, context, { topk = 1 } = {}) { - const self = /** @type {QuestionAnsweringPipeline & Pipeline} */ (/** @type {any} */ (this)); // Run tokenization - const inputs = self.tokenizer(question, { + const inputs = this.tokenizer(question, { text_pair: context, padding: true, truncation: true, }); - const output = await self.model(inputs); + const output = await this.model(inputs); /** @type {QuestionAnsweringOutput[]} */ const toReturn = []; for (let j = 0; j < output.start_logits.dims[0]; ++j) { const ids = inputs.input_ids[j]; - const sepIndex = ids.indexOf(self.tokenizer.sep_token_id); + const sepIndex = ids.indexOf(this.tokenizer.sep_token_id); const s1 = Array.from(softmax(output.start_logits[j].data)) .map((x, i) => [x, i]) @@ -509,7 +518,7 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip const answer_tokens = [...ids].slice(start, end + 1) - const answer = self.tokenizer.decode(answer_tokens, { + const answer = this.tokenizer.decode(answer_tokens, { skip_special_tokens: true, }); @@ -545,6 +554,8 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * and the sequence with the predicted token filled in, or an array of such arrays (one for each input text). * If only one input text is given, the output will be an array of objects. * @throws {Error} When the mask token is not found in the input text. + * + * @typedef {TextPipelineConstructorArgs & FillMaskPipelineCallback & Disposable} FillMaskPipelineType */ /** @@ -570,7 +581,7 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * // [{ token_str: 'spiral', score: 0.6299987435340881, token: 14061, sequence: 'The Milky Way is a spiral galaxy.' }] * ``` */ -export class FillMaskPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class FillMaskPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineType} */ (Pipeline)) { /** * Create a new FillMaskPipeline. @@ -584,25 +595,24 @@ export class FillMaskPipeline extends (/** @type {new (options: TextPipelineCons async _call(texts, { topk = 5 } = {}) { - const self = /** @type {FillMaskPipeline & Pipeline} */ (/** @type {any} */ (this)); // Run tokenization - const model_inputs = self.tokenizer(texts, { + const model_inputs = this.tokenizer(texts, { padding: true, truncation: true, }); // Run model - const outputs = await self.model(model_inputs) + const outputs = await this.model(model_inputs) const toReturn = []; for (let i = 0; i < model_inputs.input_ids.dims[0]; ++i) { const ids = model_inputs.input_ids[i]; - const mask_token_index = ids.indexOf(self.tokenizer.mask_token_id) + const mask_token_index = ids.indexOf(this.tokenizer.mask_token_id) if (mask_token_index === -1) { - throw Error(`Mask token (${self.tokenizer.mask_token}) not found in text.`) + throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`) } const logits = outputs.logits[i]; const itemLogits = logits[mask_token_index]; @@ -616,8 +626,8 @@ export class FillMaskPipeline extends (/** @type {new (options: TextPipelineCons return { score: x[1], token: x[0], - token_str: self.tokenizer.model.vocab[x[0]], - sequence: self.tokenizer.decode(sequence, { skip_special_tokens: true }), + token_str: this.tokenizer.model.vocab[x[0]], + sequence: this.tokenizer.decode(sequence, { skip_special_tokens: true }), } })); } @@ -635,6 +645,8 @@ export class FillMaskPipeline extends (/** @type {new (options: TextPipelineCons * @param {string|string[]} texts Input text for the encoder. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} + * + * @typedef {TextPipelineConstructorArgs & Text2TextGenerationPipelineCallback & Disposable} Text2TextGenerationPipelineType */ /** @@ -649,7 +661,7 @@ export class FillMaskPipeline extends (/** @type {new (options: TextPipelineCons * // [{ generated_text: "To become more healthy, you can: 1. Eat a balanced diet with plenty of fruits, vegetables, whole grains, lean proteins, and healthy fats. 2. Stay hydrated by drinking plenty of water. 3. Get enough sleep and manage stress levels. 4. Avoid smoking and excessive alcohol consumption. 5. Regularly exercise and maintain a healthy weight. 6. Practice good hygiene and sanitation. 7. Seek medical attention if you experience any health issues." }] * ``` */ -export class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineType} */ (Pipeline)) { /** @type {'generated_text'} */ _key = 'generated_text'; @@ -667,57 +679,59 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP texts = [texts]; } - const self = /** @type {Text2TextGenerationPipeline & Pipeline} */ (/** @type {any} */ (this)); // Add global prefix, if present - if (self.model.config.prefix) { - texts = texts.map(x => self.model.config.prefix + x) + if (this.model.config.prefix) { + texts = texts.map(x => this.model.config.prefix + x) } // Handle task specific params: - const task_specific_params = self.model.config.task_specific_params - if (task_specific_params && task_specific_params[self.task]) { + const task_specific_params = this.model.config.task_specific_params + if (task_specific_params && task_specific_params[this.task]) { // Add prefixes, if present - if (task_specific_params[self.task].prefix) { - texts = texts.map(x => task_specific_params[self.task].prefix + x) + if (task_specific_params[this.task].prefix) { + texts = texts.map(x => task_specific_params[this.task].prefix + x) } // TODO update generation config } + const tokenizer = this.tokenizer; const tokenizer_options = { padding: true, truncation: true, } let input_ids; - if (self instanceof TranslationPipeline && '_build_translation_inputs' in /** @type {Pipeline} */ (self).tokenizer) { + if (this instanceof TranslationPipeline && '_build_translation_inputs' in tokenizer) { // TODO: move to Translation pipeline? // Currently put here to avoid code duplication // @ts-ignore - input_ids = self.tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs).input_ids; + input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs).input_ids; } else { - input_ids = self.tokenizer(texts, tokenizer_options).input_ids; + input_ids = tokenizer(texts, tokenizer_options).input_ids; } - const outputTokenIds = await self.model.generate(input_ids, generate_kwargs); + const outputTokenIds = await this.model.generate(input_ids, generate_kwargs); - return self.tokenizer.batch_decode(outputTokenIds, { + return tokenizer.batch_decode(outputTokenIds, { skip_special_tokens: true, - }).map(text => ({ [self._key]: text })); + }).map(text => ({ [this._key]: text })); } } /** - * @typedef {Object} SummarizationGenerationSingle + * @typedef {Object} SummarizationSingle * @property {string} summary_text The summary text. - * @typedef {SummarizationGenerationSingle[]} SummarizationGenerationOutput + * @typedef {SummarizationSingle[]} SummarizationOutput * - * @callback SummarizationGenerationPipelineCallback Summarize the text(s) given as inputs. + * @callback SummarizationPipelineCallback Summarize the text(s) given as inputs. * @param {string|string[]} texts One or several articles (or one list of articles) to summarize. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. - * @returns {Promise} + * @returns {Promise} + * + * @typedef {TextPipelineConstructorArgs & SummarizationPipelineCallback & Disposable} SummarizationPipelineType */ /** @@ -740,7 +754,7 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP * // [{ summary_text: ' The Eiffel Tower is about the same height as an 81-storey building and the tallest structure in Paris. It is the second tallest free-standing structure in France after the Millau Viaduct.' }] * ``` */ -export class SummarizationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => SummarizationGenerationPipelineCallback} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { +export class SummarizationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => SummarizationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { /** @type {'summary_text'} */ _key = 'summary_text'; @@ -755,14 +769,16 @@ export class SummarizationPipeline extends (/** @type {new (options: TextPipelin /** - * @typedef {Object} TranslationGenerationSingle + * @typedef {Object} TranslationSingle * @property {string} translation_text The translated text. - * @typedef {TranslationGenerationSingle[]} TranslationGenerationOutput + * @typedef {TranslationSingle[]} TranslationOutput * - * @callback TranslationGenerationPipelineCallback Translate the text(s) given as inputs. + * @callback TranslationPipelineCallback Translate the text(s) given as inputs. * @param {string|string[]} texts Texts to be translated. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. - * @returns {Promise} + * @returns {Promise} + * + * @typedef {TextPipelineConstructorArgs & TranslationPipelineCallback & Disposable} TranslationPipelineType */ /** @@ -810,7 +826,7 @@ export class SummarizationPipeline extends (/** @type {new (options: TextPipelin * // [{ translation_text: 'Le chef des Nations affirme qu 'il n 'y a military solution in Syria.' }] * ``` */ -export class TranslationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TranslationGenerationPipelineCallback} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { +export class TranslationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TranslationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { /** @type {'translation_text'} */ _key = 'translation_text'; @@ -837,6 +853,8 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * @param {string|string[]} texts One or several prompts (or one list of prompts) to complete. * @param {TextGenerationConfig} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An array or object containing the generated texts. + * + * @typedef {TextPipelineConstructorArgs & TextGenerationPipelineCallback & Disposable} TextGenerationPipelineType */ /** @@ -889,7 +907,7 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * // }] * ``` */ -export class TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextGenerationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextGenerationPipelineType} */ (Pipeline)) { /** * Create a new TextGenerationPipeline. @@ -901,7 +919,6 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli /** @type {TextGenerationPipelineCallback} */ async _call(texts, generate_kwargs = {}) { - const self = /** @type {TextGenerationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(texts); if (!isBatched) { @@ -911,18 +928,18 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli // By default, do not add special tokens const add_special_tokens = generate_kwargs.add_special_tokens ?? false; - self.tokenizer.padding_side = 'left'; - const { input_ids, attention_mask } = self.tokenizer(texts, { + this.tokenizer.padding_side = 'left'; + const { input_ids, attention_mask } = this.tokenizer(texts, { add_special_tokens, padding: true, truncation: true, }); - const outputTokenIds = await self.model.generate(input_ids, generate_kwargs, null, { + const outputTokenIds = await this.model.generate(input_ids, generate_kwargs, null, { inputs_attention_mask: attention_mask }); - const decoded = self.tokenizer.batch_decode(outputTokenIds, { + const decoded = this.tokenizer.batch_decode(outputTokenIds, { skip_special_tokens: true, }); @@ -959,6 +976,8 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * Can be a single label, a string of comma-separated labels, or a list of labels. * @param {ZeroShotClassificationPipelineOptions} [options] The options to use for zero-shot classification. * @returns {Promise} An array or object containing the predicted labels and scores. + * + * @typedef {TextPipelineConstructorArgs & ZeroShotClassificationPipelineCallback & Disposable} ZeroShotClassificationPipelineType */ /** @@ -993,7 +1012,7 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * // } * ``` */ -export class ZeroShotClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ZeroShotClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineType} */ (Pipeline)) { /** * Create a new ZeroShotClassificationPipeline. * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -1026,7 +1045,6 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te hypothesis_template = "This example is {}.", multi_label = false, } = {}) { - const self = /** @type {ZeroShotClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(texts); if (!isBatched) { @@ -1052,20 +1070,20 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te const entails_logits = []; for (const hypothesis of hypotheses) { - const inputs = self.tokenizer(premise, { + const inputs = this.tokenizer(premise, { text_pair: hypothesis, padding: true, truncation: true, }) - const outputs = await self.model(inputs) + const outputs = await this.model(inputs) if (softmaxEach) { entails_logits.push([ - outputs.logits.data[self.contradiction_id], - outputs.logits.data[self.entailment_id] + outputs.logits.data[this.contradiction_id], + outputs.logits.data[this.entailment_id] ]) } else { - entails_logits.push(outputs.logits.data[self.entailment_id]) + entails_logits.push(outputs.logits.data[this.entailment_id]) } } @@ -1098,6 +1116,8 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te * @param {string|string[]} texts One or several texts (or one list of texts) to get the features of. * @param {FeatureExtractionPipelineOptions} [options] The options to use for feature extraction. * @returns {Promise} The features computed by the model. + * + * @typedef {TextPipelineConstructorArgs & FeatureExtractionPipelineCallback & Disposable} FeatureExtractionPipelineType */ /** @@ -1137,7 +1157,7 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te * // } * ``` */ -export class FeatureExtractionPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class FeatureExtractionPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineType} */ (Pipeline)) { /** * Create a new FeatureExtractionPipeline. * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -1151,16 +1171,15 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip pooling = /** @type {'none'} */('none'), normalize = false, } = {}) { - const self = /** @type {FeatureExtractionPipeline & Pipeline} */ (/** @type {any} */ (this)); // Run tokenization - const model_inputs = self.tokenizer(texts, { + const model_inputs = this.tokenizer(texts, { padding: true, truncation: true, }); // Run model - const outputs = await self.model(model_inputs) + const outputs = await this.model(model_inputs) // TODO: Provide warning to the user that they might be using model which was not exported // specifically for feature extraction @@ -1210,6 +1229,8 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done). * @param {AudioClassificationPipelineOptions} [options] The options to use for audio classification. * @returns {Promise} An array or object containing the predicted labels and scores. + * + * @typedef {AudioPipelineConstructorArgs & AudioClassificationPipelineCallback & Disposable} AudioClassificationPipelineType */ /** @@ -1240,7 +1261,7 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * // ] * ``` */ -export class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineType} */ (Pipeline)) { /** * Create a new AudioClassificationPipeline. @@ -1254,19 +1275,18 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio async _call(audio, { topk = null } = {}) { - const self = /** @type {AudioClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const single = !Array.isArray(audio); - const sampling_rate = self.processor.feature_extractor.config.sampling_rate; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(audio, sampling_rate); - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; const toReturn = []; for (const aud of preparedAudios) { - const inputs = await self.processor(aud); - const output = await self.model(inputs); + const inputs = await this.processor(aud); + const output = await this.model(inputs); const logits = output.logits[0]; const scores = getTopItems(softmax(logits.data), topk); @@ -1305,6 +1325,8 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio * @param {string[]} candidate_labels The candidate labels for this audio. * @param {ZeroShotAudioClassificationPipelineOptions} [options] The options to use for zero-shot audio classification. * @returns {Promise} An array of objects containing the predicted labels and scores. + * + * @typedef {TextAudioPipelineConstructorArgs & ZeroShotAudioClassificationPipelineCallback & Disposable} ZeroShotAudioClassificationPipelineType */ /** @@ -1323,7 +1345,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio * // ] * ``` */ -export class ZeroShotAudioClassificationPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ZeroShotAudioClassificationPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineType} */ (Pipeline)) { /** * Create a new ZeroShotAudioClassificationPipeline. @@ -1337,7 +1359,6 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option async _call(audio, candidate_labels, { hypothesis_template = "This is a sound of {}." } = {}) { - const self = /** @type {ZeroShotAudioClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const single = !Array.isArray(audio); if (single) { @@ -1350,20 +1371,20 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option ); // Run tokenization - const text_inputs = self.tokenizer(texts, { + const text_inputs = this.tokenizer(texts, { padding: true, truncation: true, }); - const sampling_rate = self.processor.feature_extractor.config.sampling_rate; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(audio, sampling_rate); const toReturn = []; for (const aud of preparedAudios) { - const audio_inputs = await self.processor(aud); + const audio_inputs = await this.processor(aud); // Run model with both text and audio inputs - const output = await self.model({ ...text_inputs, ...audio_inputs }); + const output = await this.model({ ...text_inputs, ...audio_inputs }); // Compute softmax per audio const probs = softmax(output.logits_per_audio.data); @@ -1416,6 +1437,8 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done). * @param {AutomaticSpeechRecognitionConfig} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object containing the transcription text and optionally timestamps if `return_timestamps` is `true`. + * + * @typedef {TextAudioPipelineConstructorArgs & AutomaticSpeechRecognitionPipelineCallback & Disposable} AutomaticSpeechRecognitionPipelineType */ /** @@ -1486,7 +1509,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * // { text: " So in college, I was a government major, which means [...] So I'd start off light and I'd bump it up" } * ``` */ -export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineType} */ (Pipeline)) { /** * Create a new AutomaticSpeechRecognitionPipeline. @@ -1498,15 +1521,14 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options /** @type {AutomaticSpeechRecognitionPipelineCallback} */ async _call(audio, kwargs = {}) { - const self = /** @type {AutomaticSpeechRecognitionPipeline & Pipeline} */ (/** @type {any} */ (this)); - switch (self.model.config.model_type) { + switch (this.model.config.model_type) { case 'whisper': - return self._call_whisper(audio, kwargs) + return this._call_whisper(audio, kwargs) case 'wav2vec2': case 'hubert': - return self._call_wav2vec2(audio, kwargs) + return this._call_wav2vec2(audio, kwargs) default: - throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${self.model.config.model_type}'.`) + throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`) } } @@ -1516,7 +1538,6 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options */ async _call_wav2vec2(audio, kwargs = {}) { // TODO use kwargs - const self = /** @type {AutomaticSpeechRecognitionPipeline & Pipeline} */ (/** @type {any} */ (this)); if (kwargs.language) { console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'); @@ -1530,20 +1551,20 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options audio = [/** @type {AudioInput} */ (audio)]; } - const sampling_rate = self.processor.feature_extractor.config.sampling_rate; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(audio, sampling_rate); const toReturn = []; for (const aud of preparedAudios) { - const inputs = await self.processor(aud); - const output = await self.model(inputs); + const inputs = await this.processor(aud); + const output = await this.model(inputs); const logits = output.logits[0]; const predicted_ids = []; for (const item of logits) { predicted_ids.push(max(item.data)[1]) } - const predicted_sentences = self.tokenizer.decode(predicted_ids) + const predicted_sentences = this.tokenizer.decode(predicted_ids) toReturn.push({ text: predicted_sentences }) } return single ? toReturn[0] : toReturn; @@ -1554,7 +1575,6 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options * @private */ async _call_whisper(audio, kwargs = {}) { - const self = /** @type {AutomaticSpeechRecognitionPipeline & Pipeline} */ (/** @type {any} */ (this)); const return_timestamps = kwargs.return_timestamps ?? false; const chunk_length_s = kwargs.chunk_length_s ?? 0; @@ -1574,7 +1594,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options throw new Error("Cannot specify `language`/`task`/`return_timestamps` and `forced_decoder_ids` at the same time.") } // @ts-ignore - const decoder_prompt_ids = self.tokenizer.get_decoder_prompt_ids({ language, task, no_timestamps: !return_timestamps }) + const decoder_prompt_ids = this.tokenizer.get_decoder_prompt_ids({ language, task, no_timestamps: !return_timestamps }) if (decoder_prompt_ids.length > 0) { kwargs.forced_decoder_ids = decoder_prompt_ids; } @@ -1585,10 +1605,10 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options audio = [/** @type {AudioInput} */ (audio)]; } - const time_precision = self.processor.feature_extractor.config.chunk_length / self.model.config.max_source_positions; - const hop_length = self.processor.feature_extractor.config.hop_length; + const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions; + const hop_length = this.processor.feature_extractor.config.hop_length; - const sampling_rate = self.processor.feature_extractor.config.sampling_rate; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(audio, sampling_rate); const toReturn = []; @@ -1613,7 +1633,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options while (offset < aud.length) { const subarr = aud.subarray(offset, offset + window); - const feature = await self.processor(subarr); + const feature = await this.processor(subarr); const isFirst = offset === 0; const isLast = offset + jump >= aud.length; @@ -1632,7 +1652,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options } else { chunks = [{ stride: [aud.length, 0, 0], - input_features: (await self.processor(aud)).input_features, + input_features: (await this.processor(aud)).input_features, is_last: true }] } @@ -1642,7 +1662,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options kwargs.num_frames = Math.floor(chunk.stride[0] / hop_length); // NOTE: doing sequentially for now - const data = await self.model.generate(chunk.input_features, kwargs); + const data = await this.model.generate(chunk.input_features, kwargs); // TODO: Right now we only get top beam if (return_timestamps === 'word') { @@ -1665,7 +1685,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options // Merge text chunks // @ts-ignore - const [full_text, optional] = self.tokenizer._decode_asr(chunks, { + const [full_text, optional] = this.tokenizer._decode_asr(chunks, { time_precision, return_timestamps, force_full_sequences }); @@ -1684,6 +1704,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options * @param {ImagePipelineInputs} texts The images to be captioned. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object (or array of objects) containing the generated text(s). + * + * @typedef {TextImagePipelineConstructorArgs & ImageToTextPipelineCallback & Disposable} ImageToTextPipelineType */ /** @@ -1705,7 +1727,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options * // [{ generated_text: 'Mr. Brown commented icily.' }] * ``` */ -export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineType} */ (Pipeline)) { /** * Create a new ImageToTextPipeline. @@ -1717,18 +1739,17 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe /** @type {ImageToTextPipelineCallback} */ async _call(images, generate_kwargs = {}) { - const self = /** @type {ImageToTextPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); const preparedImages = await prepareImages(images); - const { pixel_values } = await self.processor(preparedImages); + const { pixel_values } = await this.processor(preparedImages); const toReturn = []; for (const batch of pixel_values) { batch.dims = [1, ...batch.dims] - const output = await self.model.generate(batch, generate_kwargs); - const decoded = self.tokenizer.batch_decode(output, { + const output = await this.model.generate(batch, generate_kwargs); + const decoded = this.tokenizer.batch_decode(output, { skip_special_tokens: true, }).map(x => ({ generated_text: x.trim() })) toReturn.push(decoded); @@ -1751,6 +1772,8 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * @param {ImagePipelineInputs} images The input images(s) to be classified. * @param {ImageClassificationPipelineOptions} [options] The options to use for image classification. * @returns {Promise} An array or object containing the predicted labels and scores. + * + * @typedef {ImagePipelineConstructorArgs & ImageClassificationPipelineCallback & Disposable} ImageClassificationPipelineType */ /** @@ -1793,7 +1816,7 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * // ] * ``` */ -export class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineType} */ (Pipeline)) { /** * Create a new ImageClassificationPipeline. @@ -1807,15 +1830,14 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image async _call(images, { topk = 1 } = {}) { - const self = /** @type {ImageClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); const preparedImages = await prepareImages(images); - const { pixel_values } = await self.processor(preparedImages); - const output = await self.model({ pixel_values }); + const { pixel_values } = await this.processor(preparedImages); + const output = await this.model({ pixel_values }); - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; const toReturn = []; for (const batch of output.logits) { const scores = getTopItems(softmax(batch.data), topk); @@ -1855,6 +1877,8 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image * @param {ImagePipelineInputs} images The input images. * @param {ImageSegmentationPipelineOptions} [options] The options to use for image segmentation. * @returns {Promise} The annotated segments. + * + * @typedef {ImagePipelineConstructorArgs & ImageSegmentationPipelineCallback & Disposable} ImageSegmentationPipelineType */ /** @@ -1872,7 +1896,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image * // ] * ``` */ -export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ (Pipeline)) { /** * Create a new ImageSegmentationPipeline. * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -1897,7 +1921,6 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi target_sizes = null, subtask = null, } = {}) { - const self = /** @type {ImageSegmentationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); if (isBatched && images.length !== 1) { @@ -1907,23 +1930,23 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi const preparedImages = await prepareImages(images); const imageSizes = preparedImages.map(x => [x.height, x.width]); - const { pixel_values, pixel_mask } = await self.processor(preparedImages); - const output = await self.model({ pixel_values, pixel_mask }); + const { pixel_values, pixel_mask } = await this.processor(preparedImages); + const output = await this.model({ pixel_values, pixel_mask }); let fn = null; if (subtask !== null) { - fn = self.subtasks_mapping[subtask]; + fn = this.subtasks_mapping[subtask]; } else { - for (let [task, func] of Object.entries(self.subtasks_mapping)) { - if (func in self.processor.feature_extractor) { - fn = self.processor.feature_extractor[func].bind(self.processor.feature_extractor); + for (let [task, func] of Object.entries(this.subtasks_mapping)) { + if (func in this.processor.feature_extractor) { + fn = this.processor.feature_extractor[func].bind(this.processor.feature_extractor); subtask = task; break; } } } - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; /** @type {ImageSegmentationPipelineOutput[]} */ const annotation = []; @@ -1998,6 +2021,8 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi * @param {string[]} candidate_labels The candidate labels for this image. * @param {ZeroShotImageClassificationPipelineOptions} [options] The options to use for zero-shot image classification. * @returns {Promise} An array of objects containing the predicted labels and scores. + * + * @typedef {TextImagePipelineConstructorArgs & ZeroShotImageClassificationPipelineCallback & Disposable} ZeroShotImageClassificationPipelineType */ /** @@ -2016,7 +2041,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi * // ] * ``` */ -export class ZeroShotImageClassificationPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ZeroShotImageClassificationPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineType} */ (Pipeline)) { /** * Create a new ZeroShotImageClassificationPipeline. * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -2029,7 +2054,6 @@ export class ZeroShotImageClassificationPipeline extends (/** @type {new (option async _call(images, candidate_labels, { hypothesis_template = "This is a photo of {}" } = {}) { - const self = /** @type {ZeroShotImageClassificationPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); const preparedImages = await prepareImages(images); @@ -2040,19 +2064,19 @@ export class ZeroShotImageClassificationPipeline extends (/** @type {new (option ); // Run tokenization - const text_inputs = self.tokenizer(texts, { - padding: self.model.config.model_type === 'siglip' ? 'max_length' : true, + const text_inputs = this.tokenizer(texts, { + padding: this.model.config.model_type === 'siglip' ? 'max_length' : true, truncation: true, }); // Run processor - const { pixel_values } = await self.processor(preparedImages); + const { pixel_values } = await this.processor(preparedImages); // Run model with both text and pixel inputs - const output = await self.model({ ...text_inputs, pixel_values }); + const output = await this.model({ ...text_inputs, pixel_values }); const function_to_apply = - self.model.config.model_type === 'siglip' + this.model.config.model_type === 'siglip' ? batch => batch.sigmoid().data : batch => softmax(batch.data); @@ -2090,6 +2114,8 @@ export class ZeroShotImageClassificationPipeline extends (/** @type {new (option * @param {ImagePipelineInputs} images The input images. * @param {ObjectDetectionPipelineOptions} [options] The options to use for object detection. * @returns {Promise} A list of objects or a list of list of objects. + * + * @typedef {ImagePipelineConstructorArgs & ObjectDetectionPipelineCallback & Disposable} ObjectDetectionPipelineType */ /** @@ -2114,7 +2140,7 @@ export class ZeroShotImageClassificationPipeline extends (/** @type {new (option * // }] * ``` */ -export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineType} */ (Pipeline)) { /** * Create a new ObjectDetectionPipeline. @@ -2129,7 +2155,6 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe threshold = 0.9, percentage = false, } = {}) { - const self = /** @type {ObjectDetectionPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); @@ -2140,14 +2165,14 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe const imageSizes = percentage ? null : preparedImages.map(x => [x.height, x.width]); - const { pixel_values, pixel_mask } = await self.processor(preparedImages); - const output = await self.model({ pixel_values, pixel_mask }); + const { pixel_values, pixel_mask } = await this.processor(preparedImages); + const output = await this.model({ pixel_values, pixel_mask }); // @ts-ignore - const processed = self.processor.feature_extractor.post_process_object_detection(output, threshold, imageSizes); + const processed = this.processor.feature_extractor.post_process_object_detection(output, threshold, imageSizes); // Add labels - const id2label = self.model.config.id2label; + const id2label = this.model.config.id2label; // Format output /** @type {ObjectDetectionPipelineOutput[]} */ @@ -2182,6 +2207,8 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe * @param {string[]} candidate_labels What the model should recognize in the image. * @param {ZeroShotObjectDetectionPipelineOptions} [options] The options to use for zero-shot object detection. * @returns {Promise} An array of objects containing the predicted labels, scores, and bounding boxes. + * + * @typedef {TextImagePipelineConstructorArgs & ZeroShotObjectDetectionPipelineCallback & Disposable} ZeroShotObjectDetectionPipelineType */ /** @@ -2248,7 +2275,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe * // ] * ``` */ -export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineType} */ (Pipeline)) { /** * Create a new ZeroShotObjectDetectionPipeline. @@ -2264,19 +2291,18 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T topk = null, percentage = false, } = {}) { - const self = /** @type {ZeroShotObjectDetectionPipeline & Pipeline} */ (/** @type {any} */ (this)); const isBatched = Array.isArray(images); const preparedImages = await prepareImages(images); // Run tokenization - const text_inputs = self.tokenizer(candidate_labels, { + const text_inputs = this.tokenizer(candidate_labels, { padding: true, truncation: true, }); // Run processor - const model_inputs = await self.processor(preparedImages); + const model_inputs = await this.processor(preparedImages); // Since non-maximum suppression is performed for exporting, we need to // process each image separately. For more information, see: @@ -2288,10 +2314,10 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T const pixel_values = model_inputs.pixel_values[i].unsqueeze_(0); // Run model with both text and pixel inputs - const output = await self.model({ ...text_inputs, pixel_values }); + const output = await this.model({ ...text_inputs, pixel_values }); // @ts-ignore - const processed = self.processor.feature_extractor.post_process_object_detection(output, threshold, imageSize, true)[0]; + const processed = this.processor.feature_extractor.post_process_object_detection(output, threshold, imageSize, true)[0]; let result = processed.boxes.map((box, i) => ({ score: processed.scores[i], label: candidate_labels[processed.classes[i]], @@ -2317,6 +2343,8 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T * @param {string} question A question to ask of the document. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object (or array of objects) containing the answer(s). + * + * @typedef {TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable} DocumentQuestionAnsweringPipelineType */ /** @@ -2333,7 +2361,7 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T * // [{ answer: 'us-001' }] * ``` */ -export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineType} */ (Pipeline)) { /** * Create a new DocumentQuestionAnsweringPipeline. @@ -2345,34 +2373,33 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: /** @type {DocumentQuestionAnsweringPipelineCallback} */ async _call(image, question, generate_kwargs = {}) { - const self = /** @type {DocumentQuestionAnsweringPipeline & Pipeline} */ (/** @type {any} */ (this)); // NOTE: For now, we only support a batch size of 1 // Preprocess image const preparedImage = (await prepareImages(image))[0]; - const { pixel_values } = await self.processor(preparedImage); + const { pixel_values } = await this.processor(preparedImage); // Run tokenization const task_prompt = `${question}`; - const decoder_input_ids = self.tokenizer(task_prompt, { + const decoder_input_ids = this.tokenizer(task_prompt, { add_special_tokens: false, padding: true, truncation: true, }).input_ids; // Run model - const output = await self.model.generate( + const output = await this.model.generate( pixel_values, { ...generate_kwargs, decoder_input_ids, - max_length: self.model.config.decoder.max_position_embeddings, + max_length: this.model.config.decoder.max_position_embeddings, } ); // Decode output - const decoded = self.tokenizer.batch_decode(output)[0]; + const decoded = this.tokenizer.batch_decode(output)[0]; // Parse answer const match = decoded.match(/(.*?)<\/s_answer>/); @@ -2387,7 +2414,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: /** * @typedef {Object} VocoderOptions - * @property {PreTrainedModel} [vocoder] The vocoder to use (if the model uses one). If not provided, use the default HifiGan vocoder. + * @property {PreTrainedModel} [vocoder] The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder. * @typedef {TextAudioPipelineConstructorArgs & VocoderOptions} TextToAudioPipelineConstructorArgs */ @@ -2403,6 +2430,8 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: * @param {string|string[]} texts The text(s) to generate. * @param {TextToAudioPipelineOptions} options Parameters passed to the model generation/forward method. * @returns {Promise} An object containing the generated audio and sampling rate. + * + * @typedef {TextToAudioPipelineConstructorArgs & TextToAudioPipelineCallback & Disposable} TextToAudioPipelineType */ /** @@ -2440,7 +2469,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: * // } * ``` */ -export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineType} */ (Pipeline)) { DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan" /** @@ -2459,10 +2488,9 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi async _call(text_inputs, { speaker_embeddings = null, } = {}) { - const self = /** @type {TextToAudioPipeline & Pipeline} */ (/** @type {any} */ (this)); // If this.processor is not set, we are using a `AutoModelForTextToWaveform` model - if (self.processor) { + if (this.processor) { return this._call_text_to_spectrogram(text_inputs, { speaker_embeddings }); } else { return this._call_text_to_waveform(text_inputs); @@ -2470,18 +2498,17 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi } async _call_text_to_waveform(text_inputs) { - const self = /** @type {TextToAudioPipeline & Pipeline} */ (/** @type {any} */ (this)); // Run tokenization - const inputs = self.tokenizer(text_inputs, { + const inputs = this.tokenizer(text_inputs, { padding: true, truncation: true, }); // Generate waveform - const { waveform } = await self.model(inputs); + const { waveform } = await this.model(inputs); - const sampling_rate = self.model.config.sampling_rate; + const sampling_rate = this.model.config.sampling_rate; return { audio: waveform.data, sampling_rate, @@ -2489,12 +2516,11 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi } async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) { - const self = /** @type {TextToAudioPipeline & Pipeline} */ (/** @type {any} */ (this)); // Load vocoder, if not provided - if (!self.vocoder) { + if (!this.vocoder) { console.log('No vocoder specified, using default HifiGan vocoder.'); - self.vocoder = await AutoModel.from_pretrained(self.DEFAULT_VOCODER_ID, { quantized: false }); + this.vocoder = await AutoModel.from_pretrained(this.DEFAULT_VOCODER_ID, { quantized: false }); } // Load speaker embeddings as Float32Array from path/URL @@ -2516,16 +2542,16 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi } // Run tokenization - const { input_ids } = self.tokenizer(text_inputs, { + const { input_ids } = this.tokenizer(text_inputs, { padding: true, truncation: true, }); // NOTE: At this point, we are guaranteed that `speaker_embeddings` is a `Tensor` // @ts-ignore - const { waveform } = await self.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder }); + const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder }); - const sampling_rate = self.processor.feature_extractor.config.sampling_rate; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; return { audio: waveform.data, sampling_rate, @@ -2537,6 +2563,8 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi * @callback ImageToImagePipelineCallback Transform the image(s) passed as inputs. * @param {ImagePipelineInputs} images The images to transform. * @returns {Promise} The transformed image or list of images. + * + * @typedef {ImagePipelineConstructorArgs & ImageToImagePipelineCallback & Disposable} ImageToImagePipelineType */ /** @@ -2555,7 +2583,7 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi * // } * ``` */ -export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineType} */ (Pipeline)) { /** * Create a new ImageToImagePipeline. * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -2566,11 +2594,10 @@ export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelin /** @type {ImageToImagePipelineCallback} */ async _call(images) { - const self = /** @type {ImageToImagePipeline & Pipeline} */ (/** @type {any} */ (this)); const preparedImages = await prepareImages(images); - const inputs = await self.processor(preparedImages); - const outputs = await self.model(inputs); + const inputs = await this.processor(preparedImages); + const outputs = await this.model(inputs); /** @type {RawImage[]} */ const toReturn = []; @@ -2591,6 +2618,8 @@ export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelin * @callback DepthEstimationPipelineCallback Predicts the depth for the image(s) passed as inputs. * @param {ImagePipelineInputs} images The images to compute depth for. * @returns {Promise} An image or a list of images containing result(s). + * + * @typedef {ImagePipelineConstructorArgs & DepthEstimationPipelineCallback & Disposable} DepthEstimationPipelineType */ /** @@ -2617,7 +2646,7 @@ export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelin * // } * ``` */ -export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineCallback} */ (/** @type {any} */ Pipeline)) { +export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineType} */ (Pipeline)) { /** * Create a new DepthEstimationPipeline. * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. @@ -2628,12 +2657,11 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe /** @type {DepthEstimationPipelineCallback} */ async _call(images) { - const self = /** @type {DepthEstimationPipeline & Pipeline} */ (/** @type {any} */ (this)); const preparedImages = await prepareImages(images); - const inputs = await self.processor(preparedImages); - const { predicted_depth } = await self.model(inputs); + const inputs = await this.processor(preparedImages); + const { predicted_depth } = await this.model(inputs); const toReturn = []; for (let i = 0; i < preparedImages.length; ++i) { diff --git a/src/utils/generation.js b/src/utils/generation.js index 9b53d3c6f..1f9dc898b 100644 --- a/src/utils/generation.js +++ b/src/utils/generation.js @@ -252,17 +252,17 @@ export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { * @returns {Tensor} The modified logits. */ _call(input_ids, logits) { + const logitsData = /** @type {Float32Array} */(logits.data); + // suppress <|notimestamps|> which is handled by without_timestamps - logits.data[this.no_timestamps_token_id] = -Infinity; + logitsData[this.no_timestamps_token_id] = -Infinity; if (input_ids.length === this.begin_index - 1) { - logits.data.fill(-Infinity); - logits.data[this.timestamp_begin] = 0; + logitsData.fill(-Infinity); + logitsData[this.timestamp_begin] = 0; return logits; } - const logitsData = /** @type {Float32Array} */(logits.data); - // timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly const seq = input_ids.slice(this.begin_index); const last_was_timestamp = seq.length >= 1 && seq[seq.length - 1] >= this.timestamp_begin;