diff --git a/src/tokenizers.js b/src/tokenizers.js
index 9cf270382..8d94f2d4f 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -357,14 +357,21 @@ export class TokenizerModel extends Callable {
             case 'Unigram':
                 // @ts-ignore
                 return new Unigram(config, ...args);
-
             case 'BPE':
                 return new BPE(config);
 
             default:
+                // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
                 if (config.vocab) {
-                    // @ts-ignore
-                    return new LegacyTokenizerModel(config, ...args);
+                    if (Array.isArray(config.vocab)) {
+                        // config.vocab is of type `[string, number][]`
+                        // @ts-ignore
+                        return new Unigram(config, ...args);
+                    } else {
+                        // @ts-ignore
+                        return new LegacyTokenizerModel(config, ...args);
+                    }
                 }
                 throw new Error(`Unknown TokenizerModel type: ${config.type}`);
         }
diff --git a/tests/models/t5/tokenization.js b/tests/models/t5/tokenization.js
index 27a9ca267..c64b163f0 100644
--- a/tests/models/t5/tokenization.js
+++ b/tests/models/t5/tokenization.js
@@ -237,4 +237,13 @@ export const TEST_CONFIG = {
             decoded: "Hey . how are you",
         },
     },
+    "google-t5/t5-small": {
+        // Test that tokenizer type can be inferred (`type: "Unigram"` is missing)
+        SIMPLE: {
+            text: BASE_TEST_STRINGS.SIMPLE,
+            tokens: ["\u2581How", "\u2581are", "\u2581you", "\u2581doing", "?"],
+            ids: [571, 33, 25, 692, 58, 1],
+            decoded: "How are you doing?",
+        },
+    }
 };
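
Note (not part of the patch): a minimal standalone sketch of the fallback rule that the new `default` branch applies. The helper `inferModelType` and the two sample configs below are hypothetical and for illustration only; the actual patch constructs `Unigram` or `LegacyTokenizerModel` instances directly.

// Sketch of the inference rule: an array-shaped vocab ([token, score] pairs,
// as produced for Unigram/SentencePiece models) implies a Unigram model,
// while an object-shaped vocab falls through to the legacy tokenizer model.
function inferModelType(config) {
    if (config.type) return config.type;
    if (config.vocab) {
        return Array.isArray(config.vocab) ? 'Unigram' : 'Legacy';
    }
    throw new Error(`Unknown TokenizerModel type: ${config.type}`);
}

// Illustrative configs (not taken from real tokenizer.json files):
console.log(inferModelType({ vocab: [['\u2581How', -8.1], ['\u2581are', -7.9]] })); // 'Unigram'
console.log(inferModelType({ vocab: { '[PAD]': 0, 'hello': 1 } }));                 // 'Legacy'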