diff --git a/README.md b/README.md
index 73520ea02..ae61e0e8b 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ npm i @huggingface/transformers
 Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
 ```html
 <script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0';
 </script>
 ```
 
@@ -155,7 +155,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
 
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0/dist/), which should work out-of-the-box. You can customize this as follows:
 
 ### Settings
diff --git a/docs/snippets/2_installation.snippet b/docs/snippets/2_installation.snippet
index 96c13d2c7..353d739c8 100644
--- a/docs/snippets/2_installation.snippet
+++ b/docs/snippets/2_installation.snippet
@@ -7,6 +7,6 @@ npm i @huggingface/transformers
 Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
 ```html
 <script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0';
 </script>
 ```
diff --git a/docs/snippets/4_custom-usage.snippet b/docs/snippets/4_custom-usage.snippet
index a0fc9546f..d505f80e4 100644
--- a/docs/snippets/4_custom-usage.snippet
+++ b/docs/snippets/4_custom-usage.snippet
@@ -1,6 +1,6 @@
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0/dist/), which should work out-of-the-box. You can customize this as follows:
 
 ### Settings
diff --git a/package-lock.json b/package-lock.json
index a01f01da9..908e7d85b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.0.2",
+  "version": "3.1.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@huggingface/transformers",
-      "version": "3.0.2",
+      "version": "3.1.0",
       "license": "Apache-2.0",
       "dependencies": {
         "@huggingface/jinja": "^0.3.2",
diff --git a/package.json b/package.json
index 5a82076cb..372b89014 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.0.2",
+  "version": "3.1.0",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
diff --git a/src/env.js b/src/env.js
index b0cd97aef..3fd7cdbdd 100644
--- a/src/env.js
+++ b/src/env.js
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
 
-const VERSION = '3.0.2';
+const VERSION = '3.1.0';
 
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof self !== 'undefined';
diff --git a/src/models.js b/src/models.js
index 177a87601..7133e64d5 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3759,7 +3759,43 @@ export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { }
 // JinaCLIP models
 export class JinaCLIPPreTrainedModel extends PreTrainedModel { }
 
-export class JinaCLIPModel extends JinaCLIPPreTrainedModel { }
+export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
+    async forward(model_inputs) {
+        const missing_text_inputs = !model_inputs.input_ids;
+        const missing_image_inputs = !model_inputs.pixel_values;
+
+        if (missing_text_inputs && missing_image_inputs) {
+            throw new Error('Either `input_ids` or `pixel_values` should be provided.');
+        }
+
+        // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified.
+        if (missing_text_inputs) {
+            // NOTE: We cannot pass zero-dimension tensor as input for input_ids.
+            // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance.
+            model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]);
+        }
+
+        if (missing_image_inputs) {
+            // NOTE: Since we create a zero-sized tensor, this does not increase computation time.
+            // @ts-ignore
+            const { image_size } = this.config.vision_config;
+            model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor)
+        }
+
+        const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = await super.forward(model_inputs);
+
+        const result = {};
+        if (!missing_text_inputs) {
+            result.text_embeddings = text_embeddings;
+            result.l2norm_text_embeddings = l2norm_text_embeddings;
+        }
+        if (!missing_image_inputs) {
+            result.image_embeddings = image_embeddings;
+            result.l2norm_image_embeddings = l2norm_image_embeddings;
+        }
+        return result
+    }
+}
 
 export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
     /** @type {typeof PreTrainedModel.from_pretrained} */
diff --git a/src/models/jina_clip/image_processing_jina_clip.js b/src/models/jina_clip/image_processing_jina_clip.js
index 648e80d42..fccacdd73 100644
--- a/src/models/jina_clip/image_processing_jina_clip.js
+++ b/src/models/jina_clip/image_processing_jina_clip.js
@@ -1,5 +1,26 @@
-import { 
+import {
     ImageProcessor,
 } from "../../base/image_processors_utils.js";
 
-export class JinaCLIPImageProcessor extends ImageProcessor {}
+export class JinaCLIPImageProcessor extends ImageProcessor {
+    constructor(config) {
+        // JinaCLIPImageProcessor uses a custom preprocessor_config.json, so we configure it here
+        const { resize_mode, fill_color, interpolation, size, ...other } = config;
+
+        const new_size = resize_mode === 'squash'
+            ? { width: size, height: size }
+            : resize_mode === 'shortest'
+                ? { shortest_edge: size }
+                : { longest_edge: size };
+
+        const resample = interpolation === 'bicubic' ? 3 : 2;
+        super({
+            ...other,
+            size: new_size,
+            resample,
+            do_center_crop: true,
+            crop_size: size,
+            do_normalize: true,
+        });
+    }
+}
diff --git a/src/models/jina_clip/processing_jina_clip.js b/src/models/jina_clip/processing_jina_clip.js
new file mode 100644
index 000000000..b5219560f
--- /dev/null
+++ b/src/models/jina_clip/processing_jina_clip.js
@@ -0,0 +1,24 @@
+
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+
+export class JinaCLIPProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static image_processor_class = AutoImageProcessor
+
+    async _call(text=null, images=null, kwargs = {}) {
+
+        if (!text && !images){
+            throw new Error('Either text or images must be provided');
+        }
+
+        const text_inputs = text ? this.tokenizer(text, kwargs) : {};
+        const image_inputs = images ? await this.image_processor(images, kwargs) : {};
+
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+}
diff --git a/src/models/processors.js b/src/models/processors.js
index 735432812..cc96cd7e9 100644
--- a/src/models/processors.js
+++ b/src/models/processors.js
@@ -1,6 +1,7 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
 export * from './janus/processing_janus.js';
+export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
diff --git a/tests/init.js b/tests/init.js
index 65f079086..0783632c5 100644
--- a/tests/init.js
+++ b/tests/init.js
@@ -57,7 +57,7 @@ export function init() {
   registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY);
 }
 
-export const MAX_MODEL_LOAD_TIME = 10_000; // 10 seconds
+export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds
 export const MAX_TEST_EXECUTION_TIME = 30_000; // 30 seconds
 export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second
 
diff --git a/tests/processors.test.js b/tests/processors.test.js
index 53f94bcaa..cafcc9f2a 100644
--- a/tests/processors.test.js
+++ b/tests/processors.test.js
@@ -10,6 +10,7 @@ env.useFSCache = false;
 const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0));
 const avg = (array) => sum(array) / array.length;
 
+/** @type {Map} */
 const IMAGE_CACHE = new Map();
 const load_image = async (url) => {
   const cached = IMAGE_CACHE.get(url);
@@ -40,6 +41,7 @@ const MODELS = {
   nougat: "Xenova/nougat-small",
   owlvit: "Xenova/owlvit-base-patch32",
   clip: "Xenova/clip-vit-base-patch16",
+  jina_clip: "jinaai/jina-clip-v2",
   vitmatte: "Xenova/vitmatte-small-distinctions-646",
   dinov2: "Xenova/dinov2-small-imagenet1k-1-layer",
   // efficientnet: 'Xenova/efficientnet-b0',
@@ -490,6 +492,27 @@ describe("Processors", () => {
     MAX_TEST_EXECUTION_TIME,
   );
 
+  // JinaCLIPImageProcessor
+  // - custom config overrides
+  it(
+    MODELS.jina_clip,
+    async () => {
+      const processor = await AutoImageProcessor.from_pretrained(MODELS.jina_clip);
+
+      {
+        const image = await load_image(TEST_IMAGES.tiger);
+        const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+        compare(pixel_values.dims, [1, 3, 512, 512]);
+        compare(avg(pixel_values.data), -0.06637834757566452);
+
+        compare(original_sizes, [[408, 612]]);
+        compare(reshaped_input_sizes, [[512, 512]]);
+      }
+    },
+    MAX_TEST_EXECUTION_TIME,
+  );
+
   // VitMatteImageProcessor
   // - tests custom overrides
   // - tests multiple inputs
diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js
index 1bede1984..622a14281 100644
--- a/tests/utils/tensor.test.js
+++ b/tests/utils/tensor.test.js
@@ -69,7 +69,11 @@ describe("Tensor operations", () => {
     });
 
     it("should return a crop", async () => {
-      const t1 = new Tensor("float32", Array.from({ length: 28 }, (_, i) => i + 1), [4, 7]);
+      const t1 = new Tensor(
+        "float32",
+        Array.from({ length: 28 }, (_, i) => i + 1),
+        [4, 7],
+      );
       const t2 = t1.slice([1, -1], [1, -1]);
 
       const target = new Tensor("float32", [9, 10, 11, 12, 13, 16, 17, 18, 19, 20], [2, 5]);
diff --git a/tests/utils/utils.test.js b/tests/utils/utils.test.js
index 79c6dcc7f..77de225b8 100644
--- a/tests/utils/utils.test.js
+++ b/tests/utils/utils.test.js
@@ -65,14 +65,14 @@ describe("Utilities", () => {
     const [width, height, channels] = [2, 2, 3];
     const data = Uint8Array.from({ length: width * height * channels }, (_, i) => i % 5);
     const tiny_image = new RawImage(data, width, height, channels);
-    
+
     let image;
     beforeAll(async () => {
       image = await RawImage.fromURL("https://picsum.photos/300/200");
     });
 
     it("Can split image into separate channels", async () => {
-      const image_data = tiny_image.split().map(x => x.data);
+      const image_data = tiny_image.split().map((x) => x.data);
 
       const target = [
         new Uint8Array([0, 3, 1, 4]), // Reds
@@ -84,7 +84,10 @@ describe("Utilities", () => {
     });
 
     it("Can splits channels for grayscale", async () => {
-      const image_data = tiny_image.grayscale().split().map(x => x.data);
+      const image_data = tiny_image
+        .grayscale()
+        .split()
+        .map((x) => x.data);
       const target = [new Uint8Array([1, 3, 2, 1])];
 
       compare(image_data, target);
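
The patch above wires JinaCLIP into the processor registry, adds a combined text/image `forward` to `JinaCLIPModel`, and registers `jinaai/jina-clip-v2` in the tests. Below is a minimal usage sketch of that support. The model id and the `l2norm_*` output names come from this diff; the assumption that `AutoProcessor` resolves the repo to the new `JinaCLIPProcessor`, the sample image URL, and the caption are illustrative only, not part of the patch.

```js
// Usage sketch: compute a text-image similarity score with the JinaCLIP classes added above.
import { AutoProcessor, JinaCLIPModel, RawImage } from '@huggingface/transformers';

const model_id = 'jinaai/jina-clip-v2'; // model id taken from tests/processors.test.js

// Assumption: AutoProcessor resolves this repo to the new JinaCLIPProcessor,
// which merges tokenizer and image-processor outputs into a single feed.
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await JinaCLIPModel.from_pretrained(model_id);

// Any RawImage works here; the URL is just an example.
const image = await RawImage.fromURL('https://picsum.photos/300/200');
const inputs = await processor(['A photo of a tiger'], [image]);

// Both `input_ids` and `pixel_values` are present, so forward() returns all four outputs.
const { l2norm_text_embeddings, l2norm_image_embeddings } = await model(inputs);

// The embeddings are already L2-normalised, so cosine similarity is a plain dot product.
const a = l2norm_text_embeddings.data;
const b = l2norm_image_embeddings.data;
let similarity = 0;
for (let i = 0; i < a.length; ++i) similarity += a[i] * b[i];
console.log('text-image similarity:', similarity);
```

For retrieval-style workloads, the separate `JinaCLIPTextModel` and `JinaCLIPVisionModel` classes added in `src/models.js` avoid the dummy-input workaround in `JinaCLIPModel.forward` by loading only the branch that is actually needed.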