From c89299f2e8eb75c34d3766579e3d83ea1f3bd859 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Mon, 1 Jul 2024 10:12:24 +0200
Subject: [PATCH] Add `CLIPTextModel` and `CLIPVisionModel` (#829)

* Add `CLIPTextModel` and `CLIPVisionModel`

* Fix jinja2 version for tests
---
 src/models.js          | 25 ++++++++++++++++++++++++-
 tests/requirements.txt |  1 +
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/models.js b/src/models.js
index a8112912e..7c0cb0991 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3099,6 +3099,18 @@ export class CLIPPreTrainedModel extends PreTrainedModel { }
  */
 export class CLIPModel extends CLIPPreTrainedModel { }
 
+/**
+ * The text model from CLIP without any head or projection on top.
+ */
+export class CLIPTextModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Default the model file name on a copy, so a caller-owned `options` object is never mutated (it may be reused for other models)
+        const resolved_options = { ...options, model_file_name: options.model_file_name ?? 'text_model' };
+        return super.from_pretrained(pretrained_model_name_or_path, resolved_options);
+    }
+}
+
 /**
  * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
  *
@@ -3126,7 +3138,6 @@ export class CLIPModel extends CLIPPreTrainedModel { }
  * ```
  */
 export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
-
     /** @type {PreTrainedModel.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options = {}) {
         // Update default model file name if not provided
@@ -3135,6 +3146,18 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
     }
 }
 
+/**
+ * The vision model from CLIP without any head or projection on top.
+ */
+export class CLIPVisionModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Default the model file name on a copy, so a caller-owned `options` object is never mutated (it may be reused for other models)
+        const resolved_options = { ...options, model_file_name: options.model_file_name ?? 'vision_model' };
+        return super.from_pretrained(pretrained_model_name_or_path, resolved_options);
+    }
+}
+
 /**
  * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
  *
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 5fdb28229..baa41cfda 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,3 +3,4 @@ sacremoses==0.0.53
 sentencepiece==0.1.99
 protobuf==4.24.3
 rjieba==0.1.11
+jinja2==3.1.0