Skip to content

Commit

Permalink
Support inferring unigram tokenizer type
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Oct 4, 2024
1 parent c61a76b commit 32d8df4
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
13 changes: 10 additions & 3 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -357,14 +357,21 @@ export class TokenizerModel extends Callable {
case 'Unigram':
// @ts-ignore
return new Unigram(config, ...args);

case 'BPE':
return new BPE(config);

default:
// Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
// In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
if (config.vocab) {
// @ts-ignore
return new LegacyTokenizerModel(config, ...args);
if (Array.isArray(config.vocab)) {
// config.vocab is of type `[string, number][]`
// @ts-ignore
return new Unigram(config, ...args);
} else {
// @ts-ignore
return new LegacyTokenizerModel(config, ...args);
}
}
throw new Error(`Unknown TokenizerModel type: ${config.type}`);
}
Expand Down
9 changes: 9 additions & 0 deletions tests/models/t5/tokenization.js
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,13 @@ export const TEST_CONFIG = {
decoded: "Hey </s>. how are you</s>",
},
},
"google-t5/t5-small": {
// Test that tokenizer type can be inferred (`type: "Unigram"` is missing)
SIMPLE: {
text: BASE_TEST_STRINGS.SIMPLE,
tokens: ["\u2581How", "\u2581are", "\u2581you", "\u2581doing", "?"],
ids: [571, 33, 25, 692, 58, 1],
decoded: "How are you doing?</s>",
},
}
};

0 comments on commit 32d8df4

Please sign in to comment.