[version] Update to 3.1.0 #1053

Merged 6 commits on Nov 26, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ npm i @huggingface/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-  import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
+  import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0';
</script>
```
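For context, the `pipeline` function imported above is the library's high-level entry point. A minimal usage sketch (the default model for the task is downloaded from the Hugging Face Hub on first use; the example input is illustrative):

```js
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0';

// Allocate a pipeline for sentiment analysis.
const classifier = await pipeline('sentiment-analysis');

// Run inference; the result is an array of { label, score } objects.
const output = await classifier('I love transformers!');
// e.g. [{ label: 'POSITIVE', score: 0.9998 }]
```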

@@ -155,7 +155,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa



-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0/dist/), which should work out-of-the-box. You can customize this as follows:

### Settings

2 changes: 1 addition & 1 deletion docs/snippets/2_installation.snippet
@@ -7,6 +7,6 @@ npm i @huggingface/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-  import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
+  import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0';
</script>
```
2 changes: 1 addition & 1 deletion docs/snippets/4_custom-usage.snippet
@@ -1,6 +1,6 @@


-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0/dist/), which should work out-of-the-box. You can customize this as follows:

### Settings

4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "@huggingface/transformers",
"version": "3.0.2",
"version": "3.1.0",
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
"main": "./src/transformers.js",
"types": "./types/transformers.d.ts",
2 changes: 1 addition & 1 deletion src/env.js
@@ -26,7 +26,7 @@ import fs from 'fs';
import path from 'path';
import url from 'url';

-const VERSION = '3.0.2';
+const VERSION = '3.1.0';

// Check if various APIs are available (depends on environment)
const IS_BROWSER_ENV = typeof self !== 'undefined';
38 changes: 37 additions & 1 deletion src/models.js
@@ -3759,7 +3759,43 @@ export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { }
// JinaCLIP models
export class JinaCLIPPreTrainedModel extends PreTrainedModel { }

-export class JinaCLIPModel extends JinaCLIPPreTrainedModel { }
+export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
+    async forward(model_inputs) {
+        const missing_text_inputs = !model_inputs.input_ids;
+        const missing_image_inputs = !model_inputs.pixel_values;
+
+        if (missing_text_inputs && missing_image_inputs) {
+            throw new Error('Either `input_ids` or `pixel_values` should be provided.');
+        }
+
+        // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified.
+        if (missing_text_inputs) {
+            // NOTE: We cannot pass zero-dimension tensor as input for input_ids.
+            // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance.
+            model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]);
+        }
+
+        if (missing_image_inputs) {
+            // NOTE: Since we create a zero-sized tensor, this does not increase computation time.
+            // @ts-ignore
+            const { image_size } = this.config.vision_config;
+            model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor)
+        }
+
+        const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = await super.forward(model_inputs);
+
+        const result = {};
+        if (!missing_text_inputs) {
+            result.text_embeddings = text_embeddings;
+            result.l2norm_text_embeddings = l2norm_text_embeddings;
+        }
+        if (!missing_image_inputs) {
+            result.image_embeddings = image_embeddings;
+            result.l2norm_image_embeddings = l2norm_image_embeddings;
+        }
+        return result;
+    }
+}

export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
/** @type {typeof PreTrainedModel.from_pretrained} */
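For context, a hedged sketch of how the new `forward` override behaves when only one modality is passed. Loading via `AutoTokenizer`/`AutoModel` follows the library's usual pattern; the tokenizer options are illustrative, and calling the model dispatches to `forward`:

```js
import { AutoTokenizer, AutoModel } from '@huggingface/transformers';

const model_id = 'jinaai/jina-clip-v2'; // the checkpoint referenced in this PR's tests
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
const model = await AutoModel.from_pretrained(model_id);

// Text-only call: `pixel_values` is absent, so the override substitutes a
// zero-sized pixel tensor and omits image outputs from the result.
const text_inputs = tokenizer(['A photo of a tiger'], { padding: true, truncation: true });
const { text_embeddings, l2norm_text_embeddings } = await model(text_inputs);
```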
25 changes: 23 additions & 2 deletions src/models/jina_clip/image_processing_jina_clip.js
@@ -1,5 +1,26 @@
import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";

-export class JinaCLIPImageProcessor extends ImageProcessor {}
+export class JinaCLIPImageProcessor extends ImageProcessor {
+    constructor(config) {
+        // JinaCLIPImageProcessor uses a custom preprocessor_config.json, so we configure it here
+        const { resize_mode, fill_color, interpolation, size, ...other } = config;
+
+        const new_size = resize_mode === 'squash'
+            ? { width: size, height: size }
+            : resize_mode === 'shortest'
+                ? { shortest_edge: size }
+                : { longest_edge: size };
+
+        const resample = interpolation === 'bicubic' ? 3 : 2;
+        super({
+            ...other,
+            size: new_size,
+            resample,
+            do_center_crop: true,
+            crop_size: size,
+            do_normalize: true,
+        });
+    }
+}
24 changes: 24 additions & 0 deletions src/models/jina_clip/processing_jina_clip.js
@@ -0,0 +1,24 @@
+
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+
+export class JinaCLIPProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static image_processor_class = AutoImageProcessor
+
+    async _call(text=null, images=null, kwargs = {}) {
+
+        if (!text && !images){
+            throw new Error('Either text or images must be provided');
+        }
+
+        const text_inputs = text ? this.tokenizer(text, kwargs) : {};
+        const image_inputs = images ? await this.image_processor(images, kwargs) : {};
+
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+}
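A minimal usage sketch of the new processor, assuming it is resolved via `AutoProcessor` in the library's usual way (the image URL is illustrative):

```js
import { AutoProcessor, RawImage } from '@huggingface/transformers';

const processor = await AutoProcessor.from_pretrained('jinaai/jina-clip-v2');

// Text, images, or both may be passed; omitting both throws, as implemented above.
const image = await RawImage.fromURL('https://example.com/tiger.jpg'); // illustrative URL
const inputs = await processor(['A photo of a tiger'], image);
// `inputs` merges the tokenizer outputs (input_ids, ...) with pixel_values.
```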
1 change: 1 addition & 0 deletions src/models/processors.js
@@ -1,6 +1,7 @@
export * from './florence2/processing_florence2.js';
export * from './mgp_str/processing_mgp_str.js';
export * from './janus/processing_janus.js';
+export * from './jina_clip/processing_jina_clip.js';
export * from './owlvit/processing_owlvit.js';
export * from './pyannote/processing_pyannote.js';
export * from './qwen2_vl/processing_qwen2_vl.js';
2 changes: 1 addition & 1 deletion tests/init.js
@@ -57,7 +57,7 @@ export function init() {
registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY);
}

-export const MAX_MODEL_LOAD_TIME = 10_000; // 10 seconds
+export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds
export const MAX_TEST_EXECUTION_TIME = 30_000; // 30 seconds
export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second

23 changes: 23 additions & 0 deletions tests/processors.test.js
@@ -10,6 +10,7 @@ env.useFSCache = false;
const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0));
const avg = (array) => sum(array) / array.length;

+/** @type {Map<string, RawImage>} */
const IMAGE_CACHE = new Map();
const load_image = async (url) => {
const cached = IMAGE_CACHE.get(url);
@@ -40,6 +41,7 @@ const MODELS = {
nougat: "Xenova/nougat-small",
owlvit: "Xenova/owlvit-base-patch32",
clip: "Xenova/clip-vit-base-patch16",
+jina_clip: "jinaai/jina-clip-v2",
vitmatte: "Xenova/vitmatte-small-distinctions-646",
dinov2: "Xenova/dinov2-small-imagenet1k-1-layer",
// efficientnet: 'Xenova/efficientnet-b0',
@@ -490,6 +492,27 @@ describe("Processors", () => {
MAX_TEST_EXECUTION_TIME,
);

+  // JinaCLIPImageProcessor
+  //  - custom config overrides
+  it(
+    MODELS.jina_clip,
+    async () => {
+      const processor = await AutoImageProcessor.from_pretrained(MODELS.jina_clip);
+
+      {
+        const image = await load_image(TEST_IMAGES.tiger);
+        const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+        compare(pixel_values.dims, [1, 3, 512, 512]);
+        compare(avg(pixel_values.data), -0.06637834757566452);
+
+        compare(original_sizes, [[408, 612]]);
+        compare(reshaped_input_sizes, [[512, 512]]);
+      }
+    },
+    MAX_TEST_EXECUTION_TIME,
+  );

// VitMatteImageProcessor
// - tests custom overrides
// - tests multiple inputs
6 changes: 5 additions & 1 deletion tests/utils/tensor.test.js
@@ -69,7 +69,11 @@ describe("Tensor operations", () => {
});

it("should return a crop", async () => {
const t1 = new Tensor("float32", Array.from({ length: 28 }, (_, i) => i + 1), [4, 7]);
const t1 = new Tensor(
"float32",
Array.from({ length: 28 }, (_, i) => i + 1),
[4, 7],
);
const t2 = t1.slice([1, -1], [1, -1]);

const target = new Tensor("float32", [9, 10, 11, 12, 13, 16, 17, 18, 19, 20], [2, 5]);
9 changes: 6 additions & 3 deletions tests/utils/utils.test.js
@@ -65,14 +65,14 @@ describe("Utilities", () => {
const [width, height, channels] = [2, 2, 3];
const data = Uint8Array.from({ length: width * height * channels }, (_, i) => i % 5);
const tiny_image = new RawImage(data, width, height, channels);

let image;
beforeAll(async () => {
image = await RawImage.fromURL("https://picsum.photos/300/200");
});

it("Can split image into separate channels", async () => {
-    const image_data = tiny_image.split().map(x => x.data);
+    const image_data = tiny_image.split().map((x) => x.data);

const target = [
new Uint8Array([0, 3, 1, 4]), // Reds
@@ -84,7 +84,10 @@ });
});

it("Can splits channels for grayscale", async () => {
-    const image_data = tiny_image.grayscale().split().map(x => x.data);
+    const image_data = tiny_image
+      .grayscale()
+      .split()
+      .map((x) => x.data);
const target = [new Uint8Array([1, 3, 2, 1])];

compare(image_data, target);