From c8bbdd41f4f6901ae9263b0120cda9788c4d883d Mon Sep 17 00:00:00 2001 From: Sam L'Huillier Date: Fri, 17 Nov 2023 19:48:55 +0000 Subject: [PATCH 1/3] Implement max character check for WordPiece tokenizer (#398) * Implement max character check per token * Update maxInputCharsPerWord to max_input_chars_per_word Co-authored-by: Joshua Lochner * Update maxInputCharsPerWord to max_input_chars_per_word Co-authored-by: Joshua Lochner * Update to ?? Co-authored-by: Joshua Lochner --------- Co-authored-by: Joshua Lochner --- src/tokenizers.js | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 1f7bd9fcd..9381b2543 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -270,6 +270,7 @@ class WordPieceTokenizer extends TokenizerModel { * @param {Object} config.vocab A mapping of tokens to ids. * @param {string} config.unk_token The unknown token string. * @param {string} config.continuing_subword_prefix The prefix to use for continuing subwords. + * @param {number} [config.max_input_chars_per_word=100] The maximum number of characters per word. */ constructor(config) { super(config); @@ -291,6 +292,12 @@ class WordPieceTokenizer extends TokenizerModel { */ this.unk_token = config.unk_token; + /** + * The maximum number of characters allowed per word. + * @type {number} + */ + this.max_input_chars_per_word = config.max_input_chars_per_word ?? 100; + /** * An array of tokens. * @type {string[]} @@ -310,10 +317,10 @@ class WordPieceTokenizer extends TokenizerModel { let outputTokens = []; for (let token of tokens) { let chars = [...token]; - // TODO add - // if len(chars) > self.max_input_chars_per_word: - // output_tokens.append(self.unk_token) - // continue + if (chars.length > this.max_input_chars_per_word) { + outputTokens.push(this.unk_token); + continue; + } let isUnknown = false; let start = 0; From 6fc268cb237131172e6888a3ca10d99d3d54aff2 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 18 Nov 2023 12:58:21 +0200 Subject: [PATCH 2/3] Update sharp dependency version (#400) --- examples/next-client/package-lock.json | 6 +- examples/next-server/package-lock.json | 6 +- .../package-lock.json | 6 +- .../semantic-image-search/package-lock.json | 6 +- package-lock.json | 70 +++++++++++++++---- 5 files changed, 69 insertions(+), 25 deletions(-) diff --git a/examples/next-client/package-lock.json b/examples/next-client/package-lock.json index ea29ca18c..309387dca 100644 --- a/examples/next-client/package-lock.json +++ b/examples/next-client/package-lock.json @@ -4019,9 +4019,9 @@ } }, "node_modules/sharp": { - "version": "0.32.4", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz", - "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==", + "version": "0.32.6", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz", + "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", diff --git a/examples/next-server/package-lock.json b/examples/next-server/package-lock.json index 25f40f30b..e7861f920 100644 --- a/examples/next-server/package-lock.json +++ b/examples/next-server/package-lock.json @@ -4019,9 +4019,9 @@ } }, "node_modules/sharp": { - "version": "0.32.4", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz", - "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==", + "version": "0.32.6", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz", + "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", diff --git a/examples/semantic-image-search-client/package-lock.json b/examples/semantic-image-search-client/package-lock.json index c99b280f2..7c06d25f3 100644 --- a/examples/semantic-image-search-client/package-lock.json +++ b/examples/semantic-image-search-client/package-lock.json @@ -4073,9 +4073,9 @@ } }, "node_modules/sharp": { - "version": "0.32.4", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz", - "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==", + "version": "0.32.6", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz", + "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", diff --git a/examples/semantic-image-search/package-lock.json b/examples/semantic-image-search/package-lock.json index f714d8b13..fbc41c984 100644 --- a/examples/semantic-image-search/package-lock.json +++ b/examples/semantic-image-search/package-lock.json @@ -4256,9 +4256,9 @@ } }, "node_modules/sharp": { - "version": "0.32.4", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.4.tgz", - "integrity": "sha512-exUnZewqVZC6UXqXuQ8fyJJv0M968feBi04jb9GcUHrWtkRoAKnbJt8IfwT4NJs7FskArbJ14JAFGVuooszoGg==", + "version": "0.32.6", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz", + "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", diff --git a/package-lock.json b/package-lock.json index 3c86aafe7..4d5f8ae92 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2017,6 +2017,11 @@ "integrity": "sha512-hNfzcOV8W4NdualtqBFPyVO+54DSJuZGY9qT4pRroB6S9e3iiido2ISIC5h9R2sPJ8H3FHCIiEnsv1lPXO3KtQ==", "dev": true }, + "node_modules/b4a": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.4.tgz", + "integrity": "sha512-fpWrvyVHEKyeEvbKZTVOeZF3VSKKWtJxFIxX/jaVPf+cLbGUSitjb49pHLqPV2BUNNZ0LcoeEGfE/YCpyDYHIw==" + }, "node_modules/babel-jest": { "version": "29.6.1", "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-29.6.1.tgz", @@ -3014,9 +3019,9 @@ } }, "node_modules/detect-libc": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.1.tgz", - "integrity": "sha512-463v3ZeIrcWtdgIg6vI6XUncguvr2TnGl4SzDXinkt9mSLpBJKXT3mW6xT3VQdDN11+WVs29pgvivTc4Lp8v+w==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.2.tgz", + "integrity": "sha512-UX6sGumvvqSaXgdKGUsgZWqcUyIXZ/vZTrlRT/iobiKhGL0zL4d3osHj3uqllWJK+i+sixDS/3COVEOFbupFyw==", "engines": { "node": ">=8" } @@ -3415,6 +3420,11 @@ "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", "dev": true }, + "node_modules/fast-fifo": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz", + "integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==" + }, "node_modules/fast-glob": { "version": "3.2.12", "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.2.12.tgz", @@ -5616,9 +5626,9 @@ } }, "node_modules/node-addon-api": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.0.0.tgz", - "integrity": "sha512-GyHvgPvUXBvAkXa0YvYnhilSB1A+FRYMpIVggKzPZqdaZfevZOuzfWzyvgzOwRLHBeo/MMswmJFsrNF4Nw1pmA==" + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz", + "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==" }, "node_modules/node-forge": { "version": "1.3.1", @@ -6150,6 +6160,11 @@ } ] }, + "node_modules/queue-tick": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz", + "integrity": "sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==" + }, "node_modules/randombytes": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", @@ -6689,18 +6704,18 @@ } }, "node_modules/sharp": { - "version": "0.32.0", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.0.tgz", - "integrity": "sha512-yLAypVcqj1toSAqRSwbs86nEzfyZVDYqjuUX8grhFpeij0DDNagKJXELS/auegDBRDg1XBtELdOGfo2X1cCpeA==", + "version": "0.32.6", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz", + "integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", - "detect-libc": "^2.0.1", - "node-addon-api": "^6.0.0", + "detect-libc": "^2.0.2", + "node-addon-api": "^6.1.0", "prebuild-install": "^7.1.1", - "semver": "^7.3.8", + "semver": "^7.5.4", "simple-get": "^4.0.1", - "tar-fs": "^2.1.1", + "tar-fs": "^3.0.4", "tunnel-agent": "^0.6.0" }, "engines": { @@ -6710,6 +6725,26 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/sharp/node_modules/tar-fs": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.4.tgz", + "integrity": "sha512-5AFQU8b9qLfZCX9zp2duONhPmZv0hGYiBPJsyUdqMjzq/mqVpy/rEUSeHk1+YitmxugaptgBh5oDGU3VsAJq4w==", + "dependencies": { + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^3.1.5" + } + }, + "node_modules/sharp/node_modules/tar-stream": { + "version": "3.1.6", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.6.tgz", + "integrity": "sha512-B/UyjYwPpMBv+PaFSWAmtYjwdrlEaZQEhMIBFNC5oEG8lpiW8XjcSdmEaClj28ArfKScKHs2nshz3k2le6crsg==", + "dependencies": { + "b4a": "^1.6.4", + "fast-fifo": "^1.2.0", + "streamx": "^2.15.0" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -7026,6 +7061,15 @@ "node": ">=0.10.0" } }, + "node_modules/streamx": { + "version": "2.15.5", + "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.15.5.tgz", + "integrity": "sha512-9thPGMkKC2GctCzyCUjME3yR03x2xNo0GPKGkRw2UMYN+gqWa9uqpyNWhmsNCutU5zHmkUum0LsCRQTXUgUCAg==", + "dependencies": { + "fast-fifo": "^1.1.0", + "queue-tick": "^1.0.1" + } + }, "node_modules/string_decoder": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", From 19daf2d3c135b046440a944765d2b631dc294f42 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 18 Nov 2023 12:59:15 +0200 Subject: [PATCH 3/3] Add jsDelivr stats to README (#395) --- README.md | 7 +++++-- docs/scripts/build_readme.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index aee7a5d09..68bbe3180 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,13 @@ NPM - Downloads + NPM Downloads + + + jsDelivr Hits - License + License Documentation diff --git a/docs/scripts/build_readme.py b/docs/scripts/build_readme.py index e3beaca86..82c51dad7 100644 --- a/docs/scripts/build_readme.py +++ b/docs/scripts/build_readme.py @@ -17,10 +17,13 @@ NPM - Downloads + NPM Downloads + + + jsDelivr Hits - License + License Documentation