From 2de085b6e513a21d22afff361cca6f40a8cf4c02 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 13 Dec 2023 14:47:17 +0200 Subject: [PATCH] Add support for ChineseCLIP models (#455) * Update `VitMatteImageProcessor` test comment * Add support for ChineseCLIP models * Add chinese-clip to list of supported models * Sort zero-shot-image-classification results by score (desc) * Update expected zero-shot image classification output --- README.md | 1 + docs/snippets/6_supported-models.snippet | 1 + scripts/supported_models.py | 10 +++++++++ src/models.js | 9 ++++++++ src/pipelines.js | 26 ++++++++++++------------ src/processors.js | 2 ++ tests/pipelines.test.js | 24 +++++++++++----------- tests/processors.test.js | 1 + 8 files changed, 49 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 9d3e77a54..00827c1a7 100644 --- a/README.md +++ b/README.md @@ -274,6 +274,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index d4485f309..a79b79db2 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -10,6 +10,7 @@ 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/scripts/supported_models.py b/scripts/supported_models.py index f9b794f37..bd5de355e 100644 --- a/scripts/supported_models.py +++ b/scripts/supported_models.py @@ -184,6 +184,16 @@ # 'Xenova/tiny-random-ClapModel', } }, + 'chinese_clip': { + # Zero-shot image classification + # TODO: Add `--split_modalities` option + 'zero-shot-image-classification': [ + 'OFA-Sys/chinese-clip-vit-base-patch16', + 'OFA-Sys/chinese-clip-vit-large-patch14', + 'OFA-Sys/chinese-clip-vit-large-patch14-336px', + # 'OFA-Sys/chinese-clip-vit-huge-patch14', # TODO add + ], + }, 'clip': { # Zero-shot image classification (and feature extraction) # (with and without `--split_modalities`) diff --git a/src/models.js b/src/models.js index 6a03b867b..d9fdea8f4 100644 --- a/src/models.js +++ b/src/models.js @@ -3084,8 +3084,16 @@ export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { return super.from_pretrained(pretrained_model_name_or_path, options); } } +////////////////////////////////////////////////// + ////////////////////////////////////////////////// +// ChineseCLIP models +export class ChineseCLIPPreTrainedModel extends PreTrainedModel { } + +export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { } +////////////////////////////////////////////////// + ////////////////////////////////////////////////// // GPT2 models @@ -4677,6 +4685,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], ['clap', ['ClapModel', ClapModel]], ['clip', ['CLIPModel', CLIPModel]], + ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], ['mobilebert', ['MobileBertModel', MobileBertModel]], ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], diff --git a/src/pipelines.js b/src/pipelines.js index 4041040e0..fadaeb1ad 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -1762,38 +1762,38 @@ export class ZeroShotImageClassificationPipeline extends Pipeline { async _call(images, candidate_labels, { hypothesis_template = "This is a photo of {}" } = {}) { - let isBatched = Array.isArray(images); + const isBatched = Array.isArray(images); images = await prepareImages(images); // Insert label into hypothesis template - let texts = candidate_labels.map( + const texts = candidate_labels.map( x => hypothesis_template.replace('{}', x) ); // Run tokenization - let text_inputs = this.tokenizer(texts, { + const text_inputs = this.tokenizer(texts, { padding: true, truncation: true }); // Run processor - let { pixel_values } = await this.processor(images); + const { pixel_values } = await this.processor(images); // Run model with both text and pixel inputs - let output = await this.model({ ...text_inputs, pixel_values }); + const output = await this.model({ ...text_inputs, pixel_values }); // Compare each image with each candidate label - let toReturn = []; - for (let batch of output.logits_per_image) { + const toReturn = []; + for (const batch of output.logits_per_image) { // Compute softmax per image - let probs = softmax(batch.data); + const probs = softmax(batch.data); - toReturn.push([...probs].map((x, i) => { - return { - score: x, - label: candidate_labels[i] - } + const result = [...probs].map((x, i) => ({ + score: x, + label: candidate_labels[i] })); + result.sort((a, b) => b.score - a.score); // sort by score in descending order + toReturn.push(result); } return isBatched ? toReturn : toReturn[0]; diff --git a/src/processors.js b/src/processors.js index d38128ab4..c932f1810 100644 --- a/src/processors.js +++ b/src/processors.js @@ -613,6 +613,7 @@ export class BitImageProcessor extends ImageFeatureExtractor { } export class DPTFeatureExtractor extends ImageFeatureExtractor { } export class GLPNFeatureExtractor extends ImageFeatureExtractor { } export class CLIPFeatureExtractor extends ImageFeatureExtractor { } +export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor { } export class ConvNextFeatureExtractor extends ImageFeatureExtractor { } export class ConvNextImageProcessor extends ConvNextFeatureExtractor { } // NOTE extends ConvNextFeatureExtractor export class ViTFeatureExtractor extends ImageFeatureExtractor { } @@ -1695,6 +1696,7 @@ export class AutoProcessor { MobileViTFeatureExtractor, OwlViTFeatureExtractor, CLIPFeatureExtractor, + ChineseCLIPFeatureExtractor, ConvNextFeatureExtractor, ConvNextImageProcessor, BitImageProcessor, diff --git a/tests/pipelines.test.js b/tests/pipelines.test.js index 50d3aecd7..5ccbb6ed8 100644 --- a/tests/pipelines.test.js +++ b/tests/pipelines.test.js @@ -1179,9 +1179,9 @@ describe('Pipelines', () => { let output = await classifier(url, classes); let expected = [ - { "score": 0.992206871509552, "label": "football" }, - { "score": 0.0013248942559584975, "label": "airport" }, - { "score": 0.006468251813203096, "label": "animals" } + { score: 0.9719080924987793, label: 'football' }, + { score: 0.022564826533198357, label: 'animals' }, + { score: 0.005527070723474026, label: 'airport' } ] compare(output, expected, 0.1); @@ -1194,17 +1194,17 @@ describe('Pipelines', () => { let expected = [ [ - { "score": 0.9919875860214233, "label": "football" }, - { "score": 0.0012227334082126617, "label": "airport" }, - { "score": 0.006789708975702524, "label": "animals" } + { score: 0.9712504148483276, label: 'football' }, + { score: 0.022469401359558105, label: 'animals' }, + { score: 0.006280169822275639, label: 'airport' } ], [ - { "score": 0.0003043194592464715, "label": "football" }, - { "score": 0.998708188533783, "label": "airport" }, - { "score": 0.0009874969255179167, "label": "animals" } + { score: 0.997433602809906, label: 'airport' }, + { score: 0.0016500800848007202, label: 'animals' }, + { score: 0.0009163151844404638, label: 'football' } ], [ - { "score": 0.015163016505539417, "label": "football" }, - { "score": 0.016037866473197937, "label": "airport" }, - { "score": 0.9687991142272949, "label": "animals" } + { score: 0.9851226806640625, label: 'animals' }, + { score: 0.007516484707593918, label: 'football' }, + { score: 0.007360846735537052, label: 'airport' } ] ]; compare(output, expected, 0.1); diff --git a/tests/processors.test.js b/tests/processors.test.js index ed83f775d..f17483827 100644 --- a/tests/processors.test.js +++ b/tests/processors.test.js @@ -345,6 +345,7 @@ describe('Processors', () => { // VitMatteImageProcessor // - tests custom overrides // - tests multiple inputs + // - tests `size_divisibility` and no size (size_divisibility=32) it(MODELS.vitmatte, async () => { const processor = await AutoProcessor.from_pretrained(m(MODELS.vitmatte))