From 9308f880c5c3632cc42a2676699d5ee460f32627 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Tue, 12 Dec 2023 17:42:48 +0200
Subject: [PATCH] Add support for DINOv2 models (#444)

* Add dinov2 models

* Add `BitImageProcessor`

* Update list of supported models
---
 README.md                                |  1 +
 docs/snippets/6_supported-models.snippet |  1 +
 scripts/supported_models.py              | 17 +++++++++++++++++
 src/models.js                            | 24 ++++++++++++++++++++++++
 src/processors.js                        |  2 ++
 tests/processors.test.js                 | 17 +++++++++++++++++
 6 files changed, 62 insertions(+)

diff --git a/README.md b/README.md
index f5dc2cc13..a56477aaa 100644
--- a/README.md
+++ b/README.md
@@ -284,6 +284,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jégou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 263ae3556..bdf069c97 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -20,6 +20,7 @@
 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jégou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index d56fd660c..bf1b28ae0 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -289,6 +289,23 @@
             'facebook/detr-resnet-50-panoptic',
         ],
     },
+    'dinov2': {
+        # Feature extraction
+        'feature-extraction': [
+            'facebook/dinov2-small',
+            'facebook/dinov2-base',
+            'facebook/dinov2-large',
+            # 'facebook/dinov2-giant',  # TODO add
+        ],
+
+        # Image classification
+        'image-classification': [
+            'facebook/dinov2-small-imagenet1k-1-layer',
+            'facebook/dinov2-base-imagenet1k-1-layer',
+            'facebook/dinov2-large-imagenet1k-1-layer',
+            # 'facebook/dinov2-giant-imagenet1k-1-layer',  # TODO add
+        ],
+    },
     'distilbert': {
         # Feature extraction
         'feature-extraction': [
diff --git a/src/models.js b/src/models.js
index ef52b2d98..cb80bb90a 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3639,6 +3639,28 @@ export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel
 }
 //////////////////////////////////////////////////
 
+//////////////////////////////////////////////////
+export class Dinov2PreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.
+ */
+export class Dinov2Model extends Dinov2PreTrainedModel { }
+
+/**
+ * DINOv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet.
+ */
+export class Dinov2ForImageClassification extends Dinov2PreTrainedModel {
+    /**
+     * @param {any} model_inputs Inputs forwarded unchanged to the parent class's `_call`; the awaited result is wrapped in a `SequenceClassifierOutput`.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+
 //////////////////////////////////////////////////
 export class YolosPreTrainedModel extends PreTrainedModel { }
 export class YolosModel extends YolosPreTrainedModel { }
@@ -4330,6 +4352,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['deit', ['DeiTModel', DeiTModel]],
     ['convnext', ['ConvNextModel', ConvNextModel]],
     ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
+    ['dinov2', ['Dinov2Model', Dinov2Model]],
     ['resnet', ['ResNetModel', ResNetModel]],
     ['swin', ['SwinModel', SwinModel]],
     ['swin2sr', ['Swin2SRModel', Swin2SRModel]],
@@ -4486,6 +4509,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['deit', ['DeiTForImageClassification', DeiTForImageClassification]],
     ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
     ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]],
+    ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
     ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
     ['swin', ['SwinForImageClassification', SwinForImageClassification]],
 ]);
diff --git a/src/processors.js b/src/processors.js
index a6f1351d1..8750894a2 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -606,6 +606,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
 
 }
 
+export class BitImageProcessor extends ImageFeatureExtractor { }
 export class DPTFeatureExtractor extends ImageFeatureExtractor { }
 export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
 export class CLIPFeatureExtractor extends ImageFeatureExtractor { }
@@ -1652,6 +1653,7 @@ export class AutoProcessor {
         CLIPFeatureExtractor,
         ConvNextFeatureExtractor,
         ConvNextImageProcessor,
+        BitImageProcessor,
         DPTFeatureExtractor,
         GLPNFeatureExtractor,
         BeitFeatureExtractor,
diff --git a/tests/processors.test.js b/tests/processors.test.js
index c6703f14e..73d5b3e86 100644
--- a/tests/processors.test.js
+++ b/tests/processors.test.js
@@ -43,6 +43,7 @@ describe('Processors', () => {
             nougat: 'facebook/nougat-small',
             owlvit: 'google/owlvit-base-patch32',
             clip: 'openai/clip-vit-base-patch16',
+            dinov2: 'facebook/dinov2-small-imagenet1k-1-layer',
         }
 
         const TEST_IMAGES = {
@@ -336,6 +337,22 @@ describe('Processors', () => {
                 compare(reshaped_input_sizes, [[224, 224]]);
             }
         }, MAX_TEST_EXECUTION_TIME);
+
+        // BitImageProcessor
+        it(MODELS.dinov2, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.dinov2))
+
+            {
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 224]);
+                compare(avg(pixel_values.data), 0.06262318789958954);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 224]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
     });
 
     describe('Audio processors', () => {