
Add CogVLM (cleaner) #28196

Closed
wants to merge 129 commits
Changes from 125 commits

Commits (129 total)
1e16091
First draft
NielsRogge Nov 23, 2023
c75720b
Improve conversion script
NielsRogge Nov 23, 2023
ed527a0
More improvements
NielsRogge Nov 23, 2023
e633dca
More improvements
NielsRogge Nov 23, 2023
988d430
Add config attributes, improve conversion script
NielsRogge Nov 24, 2023
79cd06c
Make conversion work
NielsRogge Nov 24, 2023
8be1ded
Rename images to pixel_values
NielsRogge Nov 24, 2023
202fcc2
Add processor
NielsRogge Nov 25, 2023
b76b1b9
Remove einops dependency
NielsRogge Nov 25, 2023
185151d
Remove xformers dependency
NielsRogge Nov 25, 2023
1b4de2a
Improve vision config
NielsRogge Nov 25, 2023
98d47a2
Update test
NielsRogge Nov 25, 2023
4f1aa8b
Fix more tests, update conversion script
NielsRogge Nov 26, 2023
17581cc
Fix more tests
NielsRogge Nov 26, 2023
d10cbca
Fix more tests, add docstrings
NielsRogge Nov 26, 2023
5efde22
Improve variable names, docstrings
NielsRogge Nov 26, 2023
7ddd120
Improve more variable names
NielsRogge Nov 26, 2023
4071e89
Leverage _prepare_4d_causal_attention_mask
NielsRogge Nov 26, 2023
e6bd4ed
Rename classes
NielsRogge Nov 26, 2023
a80529f
Remove script
NielsRogge Nov 26, 2023
38ed9bf
Update README and docs
NielsRogge Nov 27, 2023
79f981d
Use native torch rotary embeddings
NielsRogge Nov 28, 2023
2ea6b18
Remove triton dependency
NielsRogge Dec 6, 2023
7f1e274
Remove file
NielsRogge Dec 6, 2023
d3c5fc3
Make fixup
NielsRogge Dec 6, 2023
456a439
Make fixup
NielsRogge Dec 6, 2023
3410c80
Merge branch 'main' into add_cogvlm
younesbelkada Dec 7, 2023
c52848d
Add cleaner implementation
NielsRogge Dec 11, 2023
660cc0f
More improvements
NielsRogge Dec 12, 2023
57e433d
Add position_ids
NielsRogge Dec 14, 2023
0f27526
Add print statements
NielsRogge Dec 16, 2023
dda271e
Add improvement
NielsRogge Dec 16, 2023
7383c72
Update conversion script
NielsRogge Dec 16, 2023
8209714
Fix generation
NielsRogge Dec 16, 2023
4f81841
Set use_cache to False for now
NielsRogge Dec 16, 2023
8191980
Test
NielsRogge Dec 16, 2023
8edee20
Add print statements
NielsRogge Dec 18, 2023
5bdce06
Fix use_cache
NielsRogge Dec 21, 2023
3e81dbe
Fix more tests
NielsRogge Dec 21, 2023
92657d1
Fix more tests
NielsRogge Dec 21, 2023
281a592
Make sure model works with pipeline
NielsRogge Dec 22, 2023
c26ec6c
Update auto mappings
NielsRogge Dec 22, 2023
64e0cd6
Remove print statements
NielsRogge Dec 22, 2023
de77924
Fix all tests
NielsRogge Dec 27, 2023
e58d765
Convert more checkpoints
NielsRogge Dec 28, 2023
81c0e46
Convert more checkpoints
NielsRogge Dec 28, 2023
efe5a9b
Update year
NielsRogge Jan 13, 2024
e7cd72d
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Jan 13, 2024
5430d7f
Improve conversion script
NielsRogge Jan 14, 2024
067ce32
Address comments
NielsRogge Feb 5, 2024
5e82fda
More improvements
NielsRogge Feb 5, 2024
330673d
Update device
NielsRogge Feb 12, 2024
2f7c4a8
Fix merge
NielsRogge Feb 12, 2024
9163e9e
Add copied from
NielsRogge Feb 12, 2024
b285247
Replace assert
NielsRogge Feb 12, 2024
69e45ca
Use torch.full
NielsRogge Feb 12, 2024
c43c3c6
Remove todo, apply black
NielsRogge Feb 12, 2024
d4394c0
Add docstrings, remove one-letter variable names
NielsRogge Feb 12, 2024
f190a38
Use meta device
NielsRogge Feb 12, 2024
9e6fe19
Improve conversion script
NielsRogge Feb 12, 2024
f25640c
Remove attention_fn
NielsRogge Feb 12, 2024
b660626
Remove attention function part 2
NielsRogge Feb 12, 2024
df4accc
Fix test
NielsRogge Feb 12, 2024
8802afb
Add copied from
NielsRogge Feb 12, 2024
58d6bb2
Remove unused variables
NielsRogge Feb 12, 2024
202f425
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Feb 12, 2024
dd4dc8f
Add rotary embedding class
NielsRogge Feb 12, 2024
f1ef596
Remove class
NielsRogge Feb 12, 2024
20da998
Leverage accelerate, add copied from
NielsRogge Feb 12, 2024
f7aa502
Fix style
NielsRogge Feb 12, 2024
ff4e851
Fix docs
NielsRogge Feb 12, 2024
de3548c
Add lowercase
NielsRogge Feb 15, 2024
b5deca5
Address comment
NielsRogge Feb 20, 2024
1fafa2c
Add improvement
NielsRogge Feb 21, 2024
b04d1b2
Use buffers
NielsRogge Feb 21, 2024
f8b4093
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Feb 21, 2024
fd568f0
Fix merge
NielsRogge Feb 21, 2024
5e2f09d
Remove llm_forward
NielsRogge Feb 21, 2024
8ce297f
Make fixup
NielsRogge Feb 21, 2024
549011e
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Feb 26, 2024
23dcdfa
Debug
NielsRogge Mar 12, 2024
1ba87ce
More debugging
NielsRogge Mar 17, 2024
5b1bd42
Fix merge
NielsRogge Mar 25, 2024
9513c05
Match logits
NielsRogge Mar 25, 2024
cf3bca3
Improve rotary
NielsRogge Mar 25, 2024
b80db66
Comment out hf_hub_download
NielsRogge Mar 25, 2024
b581776
Make fixup
NielsRogge Mar 25, 2024
5513c78
Add sdpa attention class
NielsRogge Mar 25, 2024
6a9adcb
Fix more tests
NielsRogge Mar 25, 2024
95ba976
Remove print statements
NielsRogge Mar 25, 2024
7be620e
More improvements
NielsRogge Mar 25, 2024
98098db
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Mar 29, 2024
61bcc7a
Fix caching
NielsRogge Mar 30, 2024
c7ef905
Support new cache format
NielsRogge Mar 30, 2024
9e85143
Make fixup
NielsRogge Mar 30, 2024
3a99604
Remove COGVLM_PRETRAINED_MODEL_ARCHIVE_LIST
NielsRogge Mar 30, 2024
6fba288
remove xformers dependency, and make eager/sdpa mathematically equiva…
fxmarty Apr 11, 2024
d884f47
Revert, something is still wrong
fxmarty Apr 11, 2024
9ba436e
implement eager attention
fxmarty Apr 17, 2024
3e27232
cleanup
fxmarty Apr 17, 2024
8879f1b
cleanup, matching at 1e-4
fxmarty Apr 17, 2024
5c08552
Fix merge
NielsRogge Apr 24, 2024
fbf9a73
Fix copies
NielsRogge Apr 24, 2024
c8baca6
Fix all tests
NielsRogge Apr 24, 2024
9c88570
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Apr 24, 2024
4296841
Remove cogvlm from doctests, don't use xformers
NielsRogge Apr 25, 2024
d54fcb6
Update cogvlm.md
NielsRogge Apr 25, 2024
16ae014
Remove script
NielsRogge Apr 25, 2024
1eed441
Address comments
NielsRogge May 1, 2024
c7ab4a9
Add copied from
NielsRogge May 1, 2024
ae612b1
Remove unused argument
NielsRogge May 1, 2024
4424455
Prepare everything in the processor
NielsRogge May 27, 2024
f3540b6
Remove script
NielsRogge May 27, 2024
e85cd55
Fix merge
NielsRogge May 27, 2024
3a2555e
Fix ruff
NielsRogge May 27, 2024
a01bb5a
Apply ruff
NielsRogge May 27, 2024
3aba8f3
Fix more processor tests
NielsRogge May 27, 2024
8e56e85
Fix more model tests
NielsRogge May 27, 2024
3c26099
Remove copied from
NielsRogge May 27, 2024
87bc77a
Fix processor tests
NielsRogge May 27, 2024
fccc433
Fix typo
NielsRogge May 27, 2024
78001e1
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Jun 3, 2024
3f1a821
Merge remote-tracking branch 'upstream/main' into add_cogvlm_cleaner
NielsRogge Jun 3, 2024
d6a9fa5
Undo gemma updates
NielsRogge Jun 3, 2024
016d793
Fix test
NielsRogge Jun 3, 2024
93a9426
Remove archive map
NielsRogge Jun 5, 2024
44d7038
Address comment
NielsRogge Jun 5, 2024
93a5d5f
Fix merge
NielsRogge Jul 1, 2024
a593d3f
Fix image processor
NielsRogge Jul 1, 2024
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -754,6 +754,8 @@
title: CLIPSeg
- local: model_doc/clvp
title: CLVP
- local: model_doc/cogvlm
title: CogVLM
- local: model_doc/data2vec
title: Data2Vec
- local: model_doc/deplot
1 change: 1 addition & 0 deletions docs/source/en/index.md
@@ -95,6 +95,7 @@ Flax), PyTorch, and/or TensorFlow.
| [CLVP](model_doc/clvp) | ✅ | ❌ | ❌ |
| [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ |
| [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ |
| [CogVLM](model_doc/cogvlm) | ✅ | ❌ | ❌ |
| [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ |
| [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ |
| [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ |
56 changes: 56 additions & 0 deletions docs/source/en/model_doc/cogvlm.md
@@ -0,0 +1,56 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# CogVLM

## Overview

The CogVLM model was proposed in [CogVLM: Visual Expert for Pretrained Language Models](https://arxiv.org/abs/2311.03079) by Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, Jiazheng Xu, Bin Xu, Juanzi Li, Yuxiao Dong, Ming Ding, Jie Tang. CogVLM adds separate QKV and MLP weights to a frozen large language model, enabling a strong multimodal foundation model that performs well on various multimodal benchmarks.

The abstract from the paper is the following:

*We introduce CogVLM, a powerful open-source visual language foundation model. Different from the popular shallow alignment method which maps image features into the input space of language model, CogVLM bridges the gap between the frozen pretrained language model and image encoder by a trainable visual expert module in the attention and FFN layers. As a result, CogVLM enables deep fusion of vision language features without sacrificing any performance on NLP tasks. CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flicker30k captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA and TDIUC, and ranks the 2nd on VQAv2, OKVQA, TextVQA, COCO captioning, etc., surpassing or matching PaLI-X 55B.*
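The core idea above is that image and text tokens are routed through separate projection weights inside each transformer layer, while the language weights stay frozen. A minimal sketch of that routing (illustrative only, not the actual CogVLM implementation; the token-type convention used here is an assumption):

```python
import torch
import torch.nn as nn


def visual_expert_projection(hidden_states, token_type_ids, language_proj, vision_proj):
    """Route each position through its own projection: image tokens use the
    added visual-expert weights, text tokens use the (frozen) language weights."""
    text_out = language_proj(hidden_states)
    vision_out = vision_proj(hidden_states)
    is_vision = (token_type_ids == 1).unsqueeze(-1)  # assumed convention: 1 = image token
    return torch.where(is_vision, vision_out, text_out)


torch.manual_seed(0)
hidden = torch.randn(1, 6, 8)                     # (batch, seq_len, hidden_size)
token_types = torch.tensor([[0, 1, 1, 1, 0, 0]])  # text = 0, image = 1
language_proj = nn.Linear(8, 24)                  # e.g. a fused QKV projection
vision_proj = nn.Linear(8, 24)                    # the added visual-expert weights

qkv = visual_expert_projection(hidden, token_types, language_proj, vision_proj)
print(qkv.shape)  # torch.Size([1, 6, 24])
```

Because the routing happens per position, training the vision-expert weights leaves the language model's behavior on text-only inputs untouched, which is the "without sacrificing any performance on NLP tasks" claim from the abstract.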

Tips:

- One can use [`CogvlmProcessor`] to prepare images and text for the model.

This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/THUDM/CogVLM).


## CogvlmConfig

[[autodoc]] CogvlmConfig

## CogvlmVisionConfig

[[autodoc]] CogvlmVisionConfig

## CogvlmProcessor

[[autodoc]] CogvlmProcessor

## CogvlmModel

[[autodoc]] CogvlmModel
- forward

## CogvlmForCausalLM

[[autodoc]] CogvlmForCausalLM
- forward
- generate
1 change: 1 addition & 0 deletions docs/source/en/perf_infer_gpu_one.md
@@ -196,6 +196,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
* [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
* [CogVLM](https://huggingface.co/docs/transformers/model_doc/cogvlm#transformers.CogvlmModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
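The SDPA path dispatches to `torch.nn.functional.scaled_dot_product_attention` (PyTorch >= 2.0). A sketch of the eager/SDPA numerical equivalence that this PR's commits verify ("matching at 1e-4"), with an explicit causal-masked softmax attention on one side and the fused kernel on the other:

```python
import math

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch, heads, seq_len, head_dim = 1, 2, 5, 8
query = torch.randn(batch, heads, seq_len, head_dim)
key = torch.randn(batch, heads, seq_len, head_dim)
value = torch.randn(batch, heads, seq_len, head_dim)

# Eager attention: explicit softmax(QK^T / sqrt(d)) V with an additive causal mask
causal_mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
scores = query @ key.transpose(-2, -1) / math.sqrt(head_dim) + causal_mask
eager_out = scores.softmax(dim=-1) @ value

# SDPA: the same computation behind a dispatched (possibly fused) kernel
sdpa_out = F.scaled_dot_product_attention(query, key, value, is_causal=True)

print(torch.allclose(eager_out, sdpa_out, atol=1e-4))  # True
```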
22 changes: 22 additions & 0 deletions src/transformers/__init__.py
@@ -287,6 +287,11 @@
"CodeGenConfig",
"CodeGenTokenizer",
],
"models.cogvlm": [
"CogvlmConfig",
"CogvlmProcessor",
"CogvlmVisionConfig",
],
"models.cohere": ["CohereConfig"],
"models.conditional_detr": ["ConditionalDetrConfig"],
"models.convbert": [
@@ -1633,6 +1638,13 @@
"CodeGenPreTrainedModel",
]
)
_import_structure["models.cogvlm"].extend(
[
"CogvlmForCausalLM",
"CogvlmModel",
"CogvlmPreTrainedModel",
]
)
_import_structure["models.cohere"].extend(["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"])
_import_structure["models.conditional_detr"].extend(
[
@@ -4849,6 +4861,11 @@
CodeGenConfig,
CodeGenTokenizer,
)
from .models.cogvlm import (
CogvlmConfig,
CogvlmProcessor,
CogvlmVisionConfig,
)
from .models.cohere import CohereConfig
from .models.conditional_detr import (
ConditionalDetrConfig,
@@ -6190,6 +6207,11 @@
CodeGenModel,
CodeGenPreTrainedModel,
)
from .models.cogvlm import (
CogvlmForCausalLM,
CogvlmModel,
CogvlmPreTrainedModel,
)
from .models.cohere import (
CohereForCausalLM,
CohereModel,
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -49,6 +49,7 @@
clvp,
code_llama,
codegen,
cogvlm,
cohere,
conditional_detr,
convbert,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -64,6 +64,7 @@
("clvp", "ClvpConfig"),
("code_llama", "LlamaConfig"),
("codegen", "CodeGenConfig"),
("cogvlm", "CogvlmConfig"),
("cohere", "CohereConfig"),
("conditional_detr", "ConditionalDetrConfig"),
("convbert", "ConvBertConfig"),
@@ -331,6 +332,7 @@
("clvp", "CLVP"),
("code_llama", "CodeLlama"),
("codegen", "CodeGen"),
("cogvlm", "CogVLM"),
("cohere", "Cohere"),
("conditional_detr", "Conditional DETR"),
("convbert", "ConvBERT"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -48,6 +48,7 @@
("chinese_clip", "ChineseCLIPImageProcessor"),
("clip", "CLIPImageProcessor"),
("clipseg", "ViTImageProcessor"),
("cogvlm", "CLIPImageProcessor"),
("conditional_detr", "ConditionalDetrImageProcessor"),
("convnext", "ConvNextImageProcessor"),
("convnextv2", "ConvNextImageProcessor"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -64,6 +64,7 @@
("clvp", "ClvpModelForConditionalGeneration"),
("code_llama", "LlamaModel"),
("codegen", "CodeGenModel"),
("cogvlm", "CogvlmModel"),
("cohere", "CohereModel"),
("conditional_detr", "ConditionalDetrModel"),
("convbert", "ConvBertModel"),
@@ -693,6 +694,7 @@
[
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("cogvlm", "CogvlmForCausalLM"),
("git", "GitForCausalLM"),
("idefics2", "Idefics2ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -56,6 +56,7 @@
("clip", "CLIPProcessor"),
("clipseg", "CLIPSegProcessor"),
("clvp", "ClvpProcessor"),
("cogvlm", "CogvlmProcessor"),
("flava", "FlavaProcessor"),
("fuyu", "FuyuProcessor"),
("git", "GitProcessor"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -138,6 +138,7 @@
),
),
("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
("cogvlm", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
(
63 changes: 63 additions & 0 deletions src/transformers/models/cogvlm/__init__.py
@@ -0,0 +1,63 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available


_import_structure = {
"configuration_cogvlm": [
"COGVLM_PRETRAINED_CONFIG_ARCHIVE_MAP",
"CogvlmConfig",
"CogvlmVisionConfig",
],
"processing_cogvlm": ["CogvlmProcessor"],
}

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_cogvlm"] = [
"CogvlmModel",
"CogvlmForCausalLM",
"CogvlmPreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_cogvlm import (
COGVLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
CogvlmConfig,
CogvlmVisionConfig,
)
from .processing_cogvlm import CogvlmProcessor

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_cogvlm import (
CogvlmForCausalLM,
CogvlmModel,
CogvlmPreTrainedModel,
)

else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
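The file above follows transformers' `_LazyModule` pattern: public names are declared up front in `_import_structure`, and the defining submodule is only imported when a name is first accessed, so `import transformers` stays fast even when torch-backed modules are heavy. A minimal stand-alone sketch of the same idea (not the actual `_LazyModule` implementation; the `json` stand-in here is just for demonstration):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Toy lazy-import module: map each attribute name to the submodule that
    defines it, and import that submodule only on first attribute access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):  # only called when attr is not found normally
        if attr not in self._name_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        submodule = importlib.import_module(self._name_to_module[attr])
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache: later lookups bypass __getattr__
        return value


# Demonstration with a stdlib module standing in for e.g. "configuration_cogvlm"
lazy = LazyModule("demo", {"json": ["dumps", "loads"]})
print(lazy.dumps({"a": 1}))  # '{"a": 1}'
```

Replacing `sys.modules[__name__]` with such an object, as the last line of the file does, is what makes `from transformers.models.cogvlm import CogvlmModel` defer the torch-dependent import until it is actually needed.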