diff --git a/tests/models/phi3_v/test_processor_phi3_v.js b/tests/models/phi3_v/test_processor_phi3_v.js new file mode 100644 index 000000000..6896046ef --- /dev/null +++ b/tests/models/phi3_v/test_processor_phi3_v.js @@ -0,0 +1,87 @@ +import { AutoProcessor, Phi3VProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + const model_id = "onnx-community/Phi-3.5-vision-instruct"; + + describe("Phi3VProcessor", () => { + /** @type {Phi3VProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id, { + // Use legacy to match python version + legacy: true, + }); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + const create_prompt = (text, images = []) => { + const placeholder = images.map((_, i) => `<|image_${i + 1}|>\n`).join(""); + const messages = [{ role: "user", content: placeholder + text }]; + const prompt = processor.tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true }); + return prompt; + }; + + it( + "Text-only", + async () => { + const prompt = create_prompt("Hi there."); + const { input_ids, pixel_values } = await processor(prompt); + expect(input_ids.dims).toEqual([1, 11]); + expect(pixel_values).toBeUndefined(); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 773 */ 770]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image (num_crops=16) & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs, { num_crops: 16 }); + expect(input_ids.dims).toEqual([1, /* 2525 */ 2522]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 17, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[1344n, 1344n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Multiple images & text", + async () => { + const imgs = [images.white_image, images.white_image]; + const prompt = create_prompt("Describe these images.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 1533 */ 1527]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([2, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([ + [672n, 672n], + [672n, 672n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +};