diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index e48378d8c25377..fb985379bfa0a9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -758,6 +758,8 @@ title: CLIPSeg - local: model_doc/clvp title: CLVP + - local: model_doc/cogvlm + title: CogVLM - local: model_doc/data2vec title: Data2Vec - local: model_doc/deplot diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ac026067ac24b7..be02e7ade36a2e 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -95,6 +95,7 @@ Flax), PyTorch, and/or TensorFlow. | [CLVP](model_doc/clvp) | ✅ | ❌ | ❌ | | [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ | | [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ | +| [CogVLM](model_doc/cogvlm) | ✅ | ❌ | ❌ | | [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ | | [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ | | [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/cogvlm.md b/docs/source/en/model_doc/cogvlm.md new file mode 100644 index 00000000000000..f4a99a4976b77e --- /dev/null +++ b/docs/source/en/model_doc/cogvlm.md @@ -0,0 +1,56 @@ + + +# CogVLM + +## Overview + +The CogVLM model was proposed in [CogVLM: Visual Expert for Pretrained Language Models](https://arxiv.org/abs/2311.03079) by Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, Jiazheng Xu, Bin Xu, Juanzi Li, Yuxiao Dong, Ming Ding, Jie Tang. CogVLM adds separate QKV and MLP weights to a frozen large language model, enabling a strong multimodal foundation model that performs well on various multimodal benchmarks. + +The abstract from the paper is the following: + +*We introduce CogVLM, a powerful open-source visual language foundation model. Different from the popular shallow alignment method which maps image features into the input space of language model, CogVLM bridges the gap between the frozen pretrained language model and image encoder by a trainable visual expert module in the attention and FFN layers. As a result, CogVLM enables deep fusion of vision language features without sacrificing any performance on NLP tasks. CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flicker30k captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA and TDIUC, and ranks the 2nd on VQAv2, OKVQA, TextVQA, COCO captioning, etc., surpassing or matching PaLI-X 55B.* + +Tips: + +- One can use [`CogvlmProcessor`] to prepare images and text for the model. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/THUDM/CogVLM). 
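+## Usage example
+
+The snippet below is a minimal generation sketch. It follows the conversion script and the
+[`CogvlmModel`] docstring example: the `"Question: ... Answer:"` template corresponds to the chat
+checkpoint (base and grounding checkpoints take the raw query instead), and decoding goes through
+the processor's underlying tokenizer.
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import CogvlmForCausalLM, CogvlmProcessor
+
+processor = CogvlmProcessor.from_pretrained("THUDM/cogvlm-chat-hf")
+model = CogvlmForCausalLM.from_pretrained(
+    "THUDM/cogvlm-chat-hf",
+    torch_dtype=torch.bfloat16,  # attn_implementation="sdpa" is also supported
+    device_map="auto",
+)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# chat checkpoints expect a "Question: ... Answer:" style prompt
+prompt = "Question: Describe this image Answer:"
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)
+
+generated_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+# strip the prompt tokens before decoding, as done in the conversion script
+generated_ids = generated_ids[:, inputs["input_ids"].shape[1] :]
+print(processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True))
+```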
+ + +## CogvlmConfig + +[[autodoc]] CogvlmConfig + +## CogvlmVisionConfig + +[[autodoc]] CogvlmVisionConfig + +## CogvlmProcessor + +[[autodoc]] CogvlmProcessor + +## CogvlmModel + +[[autodoc]] CogvlmModel + - forward + +## CogvlmForCausalLM + +[[autodoc]] CogvlmForCausalLM + - forward + - generate \ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 1569bef1f6ba1f..a6059ea20814e5 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -198,6 +198,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [CogVLM](https://huggingface.co/docs/transformers/model_doc/cogvlm#transformers.CogVLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c559ed61acad03..76d39598ff0c4b 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -287,6 +287,11 @@ "CodeGenConfig", "CodeGenTokenizer", ], + "models.cogvlm": [ + "CogvlmConfig", + "CogvlmProcessor", + "CogvlmVisionConfig", + ], "models.cohere": ["CohereConfig"], "models.conditional_detr": ["ConditionalDetrConfig"], "models.convbert": [ @@ -1659,6 +1664,13 @@ "CodeGenPreTrainedModel", ] ) + _import_structure["models.cogvlm"].extend( + [ + "CogvlmForCausalLM", + "CogvlmModel", + "CogvlmPreTrainedModel", + ] + ) _import_structure["models.cohere"].extend(["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"]) _import_structure["models.conditional_detr"].extend( [ @@ -4907,6 +4919,11 @@ CodeGenConfig, CodeGenTokenizer, ) + from .models.cogvlm import ( + CogvlmConfig, + CogvlmProcessor, + CogvlmVisionConfig, + ) from .models.cohere import CohereConfig from .models.conditional_detr import ( ConditionalDetrConfig, @@ -6274,6 +6291,11 @@ CodeGenModel, CodeGenPreTrainedModel, ) + from .models.cogvlm import ( + CogvlmForCausalLM, + CogvlmModel, + CogvlmPreTrainedModel, + ) from .models.cohere import ( CohereForCausalLM, CohereModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index f4c33491472833..b12df865409461 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -49,6 +49,7 @@ clvp, code_llama, codegen, + cogvlm, cohere, conditional_detr, convbert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7f52b3dc280ac6..2686ef81e245ff 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -64,6 +64,7 @@ ("clvp", "ClvpConfig"), ("code_llama", "LlamaConfig"), ("codegen", "CodeGenConfig"), + ("cogvlm", "CogvlmConfig"), ("cohere", "CohereConfig"), ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), @@ -336,6 +337,7 @@ ("clvp", "CLVP"), ("code_llama", "CodeLlama"), ("codegen", "CodeGen"), + ("cogvlm", "CogVLM"), ("cohere", "Cohere"), ("conditional_detr", "Conditional DETR"), ("convbert", 
"ConvBERT"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index efc2d4d998ccdd..727516b2082f51 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,6 +62,7 @@ ("chinese_clip", ("ChineseCLIPImageProcessor",)), ("clip", ("CLIPImageProcessor",)), ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("cogvlm", ("CLIPImageProcessor",)), ("conditional_detr", ("ConditionalDetrImageProcessor",)), ("convnext", ("ConvNextImageProcessor",)), ("convnextv2", ("ConvNextImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index f674b777fca7be..ceea91194ccb05 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -64,6 +64,7 @@ ("clvp", "ClvpModelForConditionalGeneration"), ("code_llama", "LlamaModel"), ("codegen", "CodeGenModel"), + ("cogvlm", "CogvlmModel"), ("cohere", "CohereModel"), ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), @@ -697,6 +698,7 @@ [ ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), + ("cogvlm", "CogvlmForCausalLM"), ("git", "GitForCausalLM"), ("idefics2", "Idefics2ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7c7342bb9fb7e7..6e5c730b1a87ed 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -56,6 +56,7 @@ ("clip", "CLIPProcessor"), ("clipseg", "CLIPSegProcessor"), ("clvp", "ClvpProcessor"), + ("cogvlm", "CogvlmProcessor"), ("flava", "FlavaProcessor"), ("fuyu", "FuyuProcessor"), ("git", "GitProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index dddab5379f5657..e7e4d6cf8dd849 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -138,6 +138,7 @@ ), ), ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), + ("cogvlm", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/cogvlm/__init__.py b/src/transformers/models/cogvlm/__init__.py new file mode 100644 index 00000000000000..ab64b23dd9c5b0 --- /dev/null +++ b/src/transformers/models/cogvlm/__init__.py @@ -0,0 +1,61 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_cogvlm": [ + "CogvlmConfig", + "CogvlmVisionConfig", + ], + "processing_cogvlm": ["CogvlmProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_cogvlm"] = [ + "CogvlmModel", + "CogvlmForCausalLM", + "CogvlmPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_cogvlm import ( + CogvlmConfig, + CogvlmVisionConfig, + ) + from .processing_cogvlm import CogvlmProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_cogvlm import ( + CogvlmForCausalLM, + CogvlmModel, + CogvlmPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/cogvlm/configuration_cogvlm.py b/src/transformers/models/cogvlm/configuration_cogvlm.py new file mode 100644 index 00000000000000..6752106b7389eb --- /dev/null +++ b/src/transformers/models/cogvlm/configuration_cogvlm.py @@ -0,0 +1,230 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CogVLM model configuration""" + +import os +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CogvlmVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CogvlmVisionModel`]. It is used to instantiate a + CogVLM vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the CogVLM + [THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 490): + The size (resolution) of each image. + num_channels (`int`, *optional*, defaults to 3): + The number of channels in each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_size (`int`, *optional*, defaults to 1792): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 15360): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 63): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used for layernorm layers. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + + Example: + + ```python + >>> from transformers import CogvlmVisionConfig, CogvlmVisionModel + + >>> # Initializing a CogvlmVisionConfig with THUDM/cogvlm-chat-hf style configuration + >>> configuration = CogvlmVisionConfig() + + >>> # Initializing a CogvlmVisionModel (with random weights) from the THUDM/cogvlm-chat-hf style configuration + >>> model = CogvlmVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "cogvlm_vision_model" + + def __init__( + self, + image_size=490, + num_channels=3, + patch_size=14, + hidden_size=1792, + intermediate_size=15360, + num_hidden_layers=63, + num_attention_heads=16, + hidden_act="gelu", + layer_norm_eps=1e-6, + initializer_range=1e-10, + dropout_prob=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.num_channels = num_channels + self.patch_size = patch_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.dropout_prob = dropout_prob + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CogvlmConfig + if config_dict.get("model_type") == "cogvlm": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CogvlmConfig(PretrainedConfig): + r""" + [`CogvlmConfig`] is the configuration class to store the configuration of a [`CogvlmForCausalLM`]. It is + used to instantiate a CogVLM model according to the specified arguments, defining the vision model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the CogVLM [THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CogvlmVisionConfig`]. + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the CogVLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CogvlmModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Example: + + ```python + >>> from transformers import CogvlmConfig, CogvlmForCausalLM + + >>> # Initializing a CogvlmConfig with THUDM/cogvlm-chat-hf style configuration + >>> configuration = CogvlmConfig() + + >>> # Initializing a CogvlmForCausalLM (with random weights) from the THUDM/cogvlm-chat-hf style configuration + >>> model = CogvlmForCausalLM(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "cogvlm" + + def __init__( + self, + vision_config=None, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-05, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + use_cache=True, + **kwargs, + ): + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. 
initializing the CogvlmVisionConfig with default values.") + + self.vision_config = CogvlmVisionConfig(**vision_config) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/src/transformers/models/cogvlm/convert_cogvlm_original_to_hf.py b/src/transformers/models/cogvlm/convert_cogvlm_original_to_hf.py new file mode 100644 index 00000000000000..2f7f764aa164d1 --- /dev/null +++ b/src/transformers/models/cogvlm/convert_cogvlm_original_to_hf.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Convert CogVLM checkpoints from the original repository. +""" + +import argparse + +import requests +import torch +from accelerate import init_empty_weights +from PIL import Image + +from transformers import ( + AutoModelForCausalLM, + CLIPImageProcessor, + CogvlmConfig, + CogvlmForCausalLM, + CogvlmProcessor, + LlamaTokenizer, +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + + +original_device = "cuda:2" +hf_device = "cuda:3" + + +@torch.no_grad() +def convert_cogvlm_checkpoint( + model_name, pytorch_dump_folder_path=None, push_to_hub=False, attn_implementation: str = "eager" +): + """ + Copy/paste/tweak model's weights to Transformers design. + """ + # load original model + # note: we need to fix the _update_model_kwargs_for_generation method to include model_inputs + # after https://github.com/huggingface/transformers/pull/29114 + revision = "refs/pr/19" if model_name == "THUDM/cogvlm-chat-hf" else None + original_model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + revision=revision, + ) + original_model.to(original_device) + + print("Original config:", original_model.config) + + # prepare dummy example + + if model_name == "THUDM/cogvlm-grounding-base-hf": + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + query = "Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?" 
+ else: + query = "Describe this image" + url = "https://raw.githubusercontent.com/THUDM/CogVLM/main/assets/metrics-min.png" + + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + tokenizer = LlamaTokenizer.from_pretrained( + "lmsys/vicuna-7b-v1.5", model_input_names=["input_ids", "attention_mask", "token_type_ids"] + ) + + template_version = original_model.config.template_version + + # prepare original inputs + inputs = original_model.build_conversation_input_ids( + tokenizer, + query=query, + history=[], + images=[image], + template_version=template_version, + ) + + def gather_inputs(inputs, device, use_bfloat16=True): + dtype = torch.bfloat16 if use_bfloat16 else torch.float32 + inputs = { + "input_ids": inputs["input_ids"].unsqueeze(0).to(device), + "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to(device), + "attention_mask": inputs["attention_mask"].unsqueeze(0).to(device), + "images": [[inputs["images"][0].to(device).to(dtype)]], + } + return inputs + + original_inputs = gather_inputs(inputs, device=original_device) + + print("Original input_ids:", tokenizer.decode(original_inputs["input_ids"][0])) + + gen_kwargs = {"max_new_tokens": 10, "do_sample": False} + with torch.no_grad(): + outputs = original_model.generate(**original_inputs, **gen_kwargs) + outputs = outputs[:, original_inputs["input_ids"].shape[1] :] + original_generated_text = tokenizer.decode(outputs[0]) + + # load HF model + # rename in_channels to num_channels for sake of consistency + original_model.config.vision_config["num_channels"] = original_model.config.vision_config.pop("in_channels") + + config = CogvlmConfig(**original_model.config.to_dict()) + config._attn_implementation = attn_implementation + with init_empty_weights(): + model = CogvlmForCausalLM(config) + + # load state dict + missing_keys, unexpected_keys = model.load_state_dict(original_model.state_dict(), strict=False, assign=True) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + model.to(hf_device) + model.eval() + + # cast all parameters to bfloat16 + for p in model.parameters(): + p.data = p.data.to(torch.bfloat16) + + # create processor + image_size = original_model.config.vision_config["image_size"] + image_processor = CLIPImageProcessor( + size={"height": image_size, "width": image_size}, + do_center_crop=False, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + ) + patch_size = original_model.config.vision_config["patch_size"] + processor = CogvlmProcessor( + image_processor=image_processor, tokenizer=tokenizer, image_size=image_size, patch_size=patch_size + ) + + # original_inputs = gather_inputs(inputs, device=hf_device) + # original_inputs["pixel_values"] = torch.stack(original_inputs.pop("images")[0]) + + if template_version == "chat": + # chat history template + prompt = f"Question: {query} Answer:" + elif template_version == "base": + # base history template + prompt = f"{query}" + else: + raise ValueError("Template version not supported") + + inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_device, torch.bfloat16) + + for k, v in inputs.items(): + print(k, v.shape) + + # verify generation + with torch.no_grad(): + outputs = model.generate(**inputs, **gen_kwargs) + outputs = outputs[:, inputs["input_ids"].shape[1] :] + generated_text = tokenizer.decode(outputs[0]) + + print("Original text:", original_generated_text) + print("HF text:", generated_text) + assert original_generated_text == generated_text + + print("Original input_ids:", 
original_inputs["input_ids"]) + print("HF input_ids:", inputs["input_ids"]) + + # verify logits + with torch.no_grad(): + with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): + original_logits = original_model(**original_inputs).logits + logits = model(**inputs).logits + + assert original_logits.shape == logits.shape + print("First values of original logits:", original_logits[0, :3, :3]) + print("Mean of original logits:", original_logits.mean()) + print("First values of HF logits:", logits[0, :3, :3]) + print("Mean of HF logits:", logits.mean()) + + print("Last values of original logits:", original_logits[0, -3:, -3:]) + print("Last values of HF logits:", logits[0, -3:, -3:]) + + reldiff = (original_logits[0, -3:, -3:].to("cuda:0") - logits[0, -3:, -3:].to("cuda:0")).abs() + print("max reldiff", reldiff.max()) + print("median reldiff", reldiff.median()) + + # assert values + # assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) + print("Logits don't match but looks ok!") + + if pytorch_dump_folder_path is not None: + processor.save_pretrained(pytorch_dump_folder_path) + model.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + processor.push_to_hub(f"nielsr/{model_name.split('/')[-1]}") + model.push_to_hub(f"nielsr/{model_name.split('/')[-1]}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="THUDM/cogvlm-chat-hf", + type=str, + choices=[ + "THUDM/cogvlm-chat-hf", + "THUDM/cogvlm-base-490-hf", + "THUDM/cogvlm-base-224-hf", + "THUDM/cogvlm-grounding-base-hf", + "THUDM/cogvlm-grounding-generalist-hf", + ], + help="Name of the model to convert", + ) + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model and processor to the hub after converting", + ) + parser.add_argument( + "--attn_implementation", + type=str, + default="sdpa", + choices=["sdpa", "eager"], + help="Whether to use Transformers SDPA or eager implementation for attention", + ) + + args = parser.parse_args() + + convert_cogvlm_checkpoint( + args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.attn_implementation + ) diff --git a/src/transformers/models/cogvlm/modeling_cogvlm.py b/src/transformers/models/cogvlm/modeling_cogvlm.py new file mode 100644 index 00000000000000..53c9b27a35d0ce --- /dev/null +++ b/src/transformers/models/cogvlm/modeling_cogvlm.py @@ -0,0 +1,1129 @@ +# coding=utf-8 +# Copyright 2024 THUDM and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch CogVLM model.""" + +import math +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers import PreTrainedModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.utils.logging import get_logger + +from ...cache_utils import Cache, DynamicCache +from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from .configuration_cogvlm import CogvlmConfig + + +if TYPE_CHECKING: + from transformers.utils import ModelOutput + +logger = get_logger(__name__) + +LANGUAGE_TOKEN_TYPE = 0 +VISION_TOKEN_TYPE = 1 + +_CONFIG_FOR_DOC = "CogvlmConfig" + + +COGVLM_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CogvlmConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +COGVLM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`CLIPImageProcessor.__call__`] for details. + + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class CogvlmPatchEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + + self.num_patches = (config.image_size // config.patch_size) ** 2 + 1 + self.proj = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(self.num_patches, config.hidden_size) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor: + embeddings = self.proj(pixel_values) + embeddings = embeddings.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(embeddings.shape[0], -1, -1) + embeddings = torch.cat((cls_token, embeddings), dim=1) + embeddings += self.position_embedding.weight.unsqueeze(0) + return embeddings + + +class CogvlmVisionAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_heads = config.num_attention_heads + self.query_key_value = nn.Linear(config.hidden_size, config.hidden_size * 3) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + head_dim = config.hidden_size // config.num_attention_heads + self.scale = 1.0 / math.sqrt(head_dim) + + def forward(self, hidden_state: torch.FloatTensor) -> torch.FloatTensor: + batch_size, sequence_length, _ = hidden_state.shape + qkv = self.query_key_value(hidden_state) + + # reshape to (3, batch_size, sequence_length, num_heads, head_dim) + qkv = qkv.reshape(batch_size, sequence_length, 3, self.num_heads, -1).permute(2, 0, 1, 3, 4) + queries, keys, values = qkv[0], qkv[1], qkv[2] + + # import xformers.ops as xops + + # out = xops.memory_efficient_attention( + # queries, + # keys, + # values, + # scale=self.scale, + # ) + + # output = self.dense(out.view(batch_size, sequence_length, -1)) + # output = self.output_dropout(output) + # return output + + queries = queries.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + + queries = queries * self.scale + attention_scores = queries @ keys.transpose(-2, -1) + + # PyTorch already accumulates softmax on fp32 (Reference: https://github.com/pytorch/pytorch/pull/103167) + attention_probs = attention_scores.softmax(-1) + attention_output = attention_probs @ values + attention_output = attention_output.transpose(1, 2).contiguous() + + output = self.dense(attention_output.view(batch_size, sequence_length, -1)) + output = self.output_dropout(output) + return output + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->CogvlmVision +class CogvlmVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + 
hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CogvlmVisionTransformerLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = CogvlmVisionAttention(config) + self.mlp = CogvlmVisionMLP(config) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + + attention_output = self.attention(attention_input) + + attention_output = self.input_layernorm(attention_output) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class CogvlmVisionTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([CogvlmVisionTransformerLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states): + for idx, layer_module in enumerate(self.layers): + hidden_states = layer_module(hidden_states) + + return hidden_states + + +class CogvlmVisionGLU(nn.Module): + def __init__(self, config, in_features): + super().__init__() + self.linear_proj = nn.Linear(in_features, config.hidden_size, bias=False) + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = nn.functional.silu + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + def forward(self, hidden_state): + hidden_state = self.linear_proj(hidden_state) + hidden_state = self.act1(self.norm1(hidden_state)) + hidden_state = self.act2(self.gate_proj(hidden_state)) * self.dense_h_to_4h(hidden_state) + hidden_state = self.dense_4h_to_h(hidden_state) + return hidden_state + + +class CogvlmVisionModel(nn.Module): + def __init__(self, config: CogvlmConfig): + super().__init__() + + self.patch_embedding = CogvlmPatchEmbedding(config.vision_config) + self.transformer = CogvlmVisionTransformer(config.vision_config) + self.linear_proj = CogvlmVisionGLU(config, in_features=config.vision_config.hidden_size) + # parameters for beginning of image (boi) and end of image (eoi) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor: + hidden_state = self.patch_embedding(pixel_values) + hidden_state = self.transformer(hidden_state) + hidden_state = hidden_state[:, 1:] + hidden_state = self.linear_proj(hidden_state) + beginning_of_image_features = self.boi.expand(hidden_state.shape[0], -1, -1) + end_of_image_features = self.eoi.expand(hidden_state.shape[0], -1, -1) + hidden_state = torch.cat((beginning_of_image_features, hidden_state, end_of_image_features), dim=1) + return hidden_state + + +# Copied from transformers.models.mistral.modeling_mistral.MistralRMSNorm with Mistral->Cogvlm +class CogvlmRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + CogvlmRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + # Ignore copy + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + 
hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return (self.weight * hidden_states).to(input_dtype) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Cogvlm +class CogvlmMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +def get_expert_mask(token_type_ids: torch.LongTensor) -> (torch.BoolTensor, torch.BoolTensor): + vision_token_mask = torch.zeros_like(token_type_ids, dtype=torch.bool) + vision_token_mask[:, :-1] = (token_type_ids[:, :-1] == VISION_TOKEN_TYPE) & ( + token_type_ids[:, 1:] == VISION_TOKEN_TYPE + ) + language_token_mask = ~vision_token_mask + return vision_token_mask, language_token_mask + + +class CogvlmVisionExpertMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.language_mlp = CogvlmMLP(config) + self.vision_mlp = CogvlmMLP(config) + + def forward(self, hidden_states: torch.FloatTensor, token_type_ids: torch.LongTensor): + output = torch.empty(hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device) + vision_token_mask, language_token_mask = get_expert_mask(token_type_ids) + output[vision_token_mask] = self.vision_mlp(hidden_states[vision_token_mask]) + output[language_token_mask] = self.language_mlp(hidden_states[language_token_mask]) + return output + + +class CogvlmRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = self._compute_inv_freq(device) + self.register_buffer("inv_freq", inv_freq) + self.max_seq_len_cached = 0 + + def _compute_inv_freq(self, device=None): + return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device) / self.dim)) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[:, None, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[:, None, :].to(dtype), persistent=False) + + def forward(self, x, seq_len): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:seq_len, ...].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return 
torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class CogvlmVisionExpertAttention(nn.Module): + def __init__(self, config: CogvlmConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.layer_idx = layer_idx + + self.rotary_emb = CogvlmRotaryEmbedding(self.head_dim) + self.vision_expert_query_key_value = nn.Linear(self.hidden_size, self.hidden_size * 3, bias=False) + self.vision_expert_dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.language_expert_query_key_value = nn.Linear(self.hidden_size, self.hidden_size * 3, bias=False) + self.language_expert_dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + # Reference: https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/118 and SDPA C++ implementation. 
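+        # The usual 1/sqrt(head_dim) attention scale is split into two sqrt(1/sqrt(head_dim))
+        # factors, applied to the queries and to the keys respectively before their matmul in
+        # `forward`; the product of the two factors recovers the full scale while keeping the
+        # intermediate values smaller, which is friendlier to half precision.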
+ self.sqrt_scale = math.sqrt(1 / math.sqrt(self.head_dim)) + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [batch_size, sequence_length, num_head*head_dim] into a 4D tensor with size + [batch_size, num_heads, seq_length, head_dim].""" + new_tensor_shape = tensor.size()[:-1] + (self.num_heads, self.head_dim) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + token_type_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + batch_size, q_len, hidden_size = hidden_states.size() + vision_token_mask, language_token_mask = get_expert_mask(token_type_ids) + + mixed_raw_layer = torch.empty( + (batch_size, q_len, hidden_size * 3), dtype=hidden_states.dtype, device=hidden_states.device + ) + mixed_raw_layer[vision_token_mask] = self.vision_expert_query_key_value(hidden_states[vision_token_mask]) + mixed_raw_layer[language_token_mask] = self.language_expert_query_key_value(hidden_states[language_token_mask]) + + query_states, key_states, value_states = torch.split(mixed_raw_layer, self.hidden_size, dim=-1) + query_states = self._transpose_for_scores(query_states) + key_states = self._transpose_for_scores(key_states) + value_states = self._transpose_for_scores(value_states) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_seq_length(self.layer_idx) + + cos, sin = self.rotary_emb(value_states, seq_len=position_ids.max() + 1) + cos, sin = ( + nn.functional.embedding(position_ids, cos.squeeze(1)), + nn.functional.embedding(position_ids, sin.squeeze(1)), + ) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids, unsqueeze_dim=1 + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attn_weights = torch.matmul(query_states * self.sqrt_scale, (key_states * self.sqrt_scale).transpose(2, 3)) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # PyTorch already accumulates softmax on fp32 (Reference: https://github.com/pytorch/pytorch/pull/103167) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=0.0, training=self.training) + context_layer = torch.matmul(attn_weights, value_states) + + if context_layer.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {context_layer.size()}" + ) + + context_layer = context_layer.transpose(1, 2).contiguous() + + context_layer = context_layer.reshape(batch_size, q_len, self.hidden_size) + + attn_output = torch.empty(context_layer.shape, dtype=hidden_states.dtype, device=hidden_states.device) + attn_output[vision_token_mask] = self.vision_expert_dense(context_layer[vision_token_mask]) + attn_output[language_token_mask] = self.language_expert_dense(context_layer[language_token_mask]) + + return ( + (attn_output, attn_weights, past_key_value) if output_attentions else 
(attn_output, None, past_key_value) + ) + + +class CogvlmVisionExpertSdpaAttention(CogvlmVisionExpertAttention): + """ + CogVLM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `CogvlmVisionExpertAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from CogvlmVisionExpertAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + token_type_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "CogVLM is using CogvlmVisionExpertSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + batch_size, q_len, hidden_size = hidden_states.size() + vision_token_mask, language_token_mask = get_expert_mask(token_type_ids) + + mixed_raw_layer = torch.empty( + (batch_size, q_len, hidden_size * 3), dtype=hidden_states.dtype, device=hidden_states.device + ) + mixed_raw_layer[vision_token_mask] = self.vision_expert_query_key_value(hidden_states[vision_token_mask]) + mixed_raw_layer[language_token_mask] = self.language_expert_query_key_value(hidden_states[language_token_mask]) + + query_states, key_states, value_states = torch.split(mixed_raw_layer, self.hidden_size, dim=-1) + query_states = self._transpose_for_scores(query_states) + key_states = self._transpose_for_scores(key_states) + value_states = self._transpose_for_scores(value_states) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_seq_length(self.layer_idx) + + cos, sin = self.rotary_emb(value_states, seq_len=position_ids.max() + 1) + cos, sin = ( + nn.functional.embedding(position_ids, cos.squeeze(1)), + nn.functional.embedding(position_ids, sin.squeeze(1)), + ) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids, unsqueeze_dim=1 + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_mask_bool = attention_mask == 0 + is_full = (attention_mask_bool > 0).all() + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=None, + dropout_p=0.0, + is_causal=not is_full, + ) + + if context_layer.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {context_layer.size()}" + ) + + context_layer = context_layer.transpose(1, 
2).contiguous().reshape(batch_size, q_len, self.hidden_size) + + attn_output = torch.empty(context_layer.shape, dtype=hidden_states.dtype, device=hidden_states.device) + attn_output[vision_token_mask] = self.vision_expert_dense(context_layer[vision_token_mask]) + attn_output[language_token_mask] = self.language_expert_dense(context_layer[language_token_mask]) + + return (attn_output, None, past_key_value) + + +COGVLM_ATTENTION_CLASSES = { + "eager": CogvlmVisionExpertAttention, + "sdpa": CogvlmVisionExpertSdpaAttention, +} + + +class CogvlmDecoderLayer(nn.Module): + def __init__(self, config: CogvlmConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = COGVLM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.mlp = CogvlmVisionExpertMLP(config) + self.input_layernorm = CogvlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = CogvlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + token_type_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = None, + use_cache: Optional[bool] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states, token_type_ids=token_type_ids) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class CogvlmPreTrainedModel(PreTrainedModel): + config_class = CogvlmConfig + base_model_prefix = "model" + supports_gradient_checkpointing = False + _supports_sdpa = True + _no_split_modules = ["CogvlmDecoderLayer", "CogvlmVisionTransformerLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +def build_position_ids( + token_type_ids: torch.BoolTensor, attention_mask: Optional[torch.BoolTensor] = None +) -> torch.LongTensor: + """ + Create position_ids based on provided token_type_ids and attention_mask. 
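+
+    Language tokens get consecutive positions, while a run of image (vision) tokens shares a
+    single position: only the first vision token following a language token increments the
+    counter, and the boi/eoi tokens that wrap the image are treated as language tokens for this
+    purpose. For example, token types `[LANG, VISION (boi), VISION, VISION, VISION, VISION (eoi),
+    LANG, LANG]` produce position ids `[0, 1, 2, 2, 2, 3, 4, 5]`.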
+ """ + + if attention_mask is not None: + tmp = token_type_ids.clone() + tmp[~(attention_mask.bool())] = -1 + else: + tmp = token_type_ids.clone() + + # image beginning-of-image (boi), end-of-image (eoi) token as LANGUAGE_TOKEN_TYPE + is_boi_eoi = torch.zeros_like(token_type_ids, dtype=torch.bool) + is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) + is_boi_eoi[:, 0] |= tmp[:, 0] == VISION_TOKEN_TYPE + is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) + is_boi_eoi[:, -1] |= tmp[:, -1] == VISION_TOKEN_TYPE + tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE + + # final position ids + position_ids = torch.zeros_like(token_type_ids, dtype=torch.long) + position_ids[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ( + (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) + ) + position_ids = position_ids.cumsum(dim=-1) + return position_ids + + +@add_start_docstrings( + """ + CogVLM model without any head on top, just outputting raw hidden states. + """, + COGVLM_START_DOCSTRING, +) +class CogvlmModel(CogvlmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.vision = CogvlmVisionModel(config) + self.num_vision_tokens = ( + self.config.vision_config.image_size // self.config.vision_config.patch_size + ) ** 2 + 2 + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [CogvlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = CogvlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + vision_input_ids = torch.tensor( + [self.config.bos_token_id] + [self.config.pad_token_id] * self.num_vision_tokens, + ) + self.register_buffer("vision_input_ids", vision_input_ids, persistent=False) + vision_token_type_ids = torch.tensor([LANGUAGE_TOKEN_TYPE] + [VISION_TOKEN_TYPE] * self.num_vision_tokens) + self.register_buffer("vision_token_type_ids", vision_token_type_ids, persistent=False) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def encode_images(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + images_features = self.vision(pixel_values) + return images_features + + @add_start_docstrings_to_model_forward(COGVLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: List[List[torch.Tensor]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored, + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import CogvlmProcessor, CogvlmModel + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> processor = CogvlmProcessor.from_pretrained("THUDM/cogvlm-chat-hf") + >>> model = CogvlmModel.from_pretrained("THUDM/cogvlm-chat-hf") + + >>> # load image + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> query = "Describe this image" + + >>> prompt = f"Question: {query} Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # forward pass + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> last_hidden_state = outputs.last_hidden_state + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if past_key_values is not None: + pass # generate mode with past_key_values. the image features are already mapped + else: + if pixel_values is not None: + # multi-modality + if token_type_ids is None: + raise ValueError("Multi-modality requires `token_type_ids`!") + if input_ids is not None: + if len(input_ids) != len(pixel_values): + raise ValueError("Make sure to pass as many texts as images") + inputs_embeds = self.embed_tokens(input_ids) if input_ids is not None else inputs_embeds + images_features = self.encode_images(pixel_values) + images_features = images_features.reshape(-1, images_features.shape[-1]) + images_features = images_features.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device) + inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features) + else: # single-modality + if token_type_ids is None: + token_type_ids = ( + torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device) * LANGUAGE_TOKEN_TYPE + ) + assert not ( + token_type_ids == VISION_TOKEN_TYPE + ).any(), f"{(token_type_ids == VISION_TOKEN_TYPE).sum()}" + inputs_embeds = self.embed_tokens(input_ids) + + if position_ids is None: + position_ids = build_position_ids(token_type_ids, attention_mask) + input_ids = None + + # next: forward pass, which largely is copy from llama and adapted to add `token_type_ids` + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_seq_length() + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, 
seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + +@add_start_docstrings( + """ + CogVLM model with a language modeling head on top (a linear layer on top of the hidden states). 
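In the multimodal branch of `CogvlmModel.forward` above, the embedded prompt already contains pad-token placeholder embeddings at every vision position, and the flattened image features are written into exactly those slots. A minimal sketch of that scatter step with toy shapes and no real vision encoder:

```python
import torch

batch_size, seq_len, hidden = 1, 7, 8
num_vision_tokens = 4  # toy value; the real count is (image_size // patch_size) ** 2 + 2

inputs_embeds = torch.zeros(batch_size, seq_len, hidden)      # embedded [bos] + pads + text
token_type_ids = torch.tensor([[0, 1, 1, 1, 1, 0, 0]])        # 0 = language, 1 = vision
image_features = torch.randn(batch_size, num_vision_tokens, hidden)

# Flatten the image features and write them into the vision-token slots, mirroring
# `inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features)`.
flat_features = image_features.reshape(-1, hidden)
inputs_embeds = inputs_embeds.index_put([token_type_ids == 1], flat_features)

print(inputs_embeds[0, 1:5].eq(flat_features).all())  # tensor(True)
```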
+ """, + COGVLM_START_DOCSTRING, +) +class CogvlmForCausalLM(CogvlmPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + self.model = CogvlmModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(COGVLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored, + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import CogvlmProcessor, CogvlmForCausalLM + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> processor = CogvlmProcessor.from_pretrained("THUDM/cogvlm-chat-hf") + >>> model = CogvlmForCausalLM.from_pretrained("THUDM/cogvlm-chat-hf") + + >>> # load image + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> query = "Describe this image" + + >>> prompt = f"Question: {query} Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + >>> outputs = model.generate(**inputs) + + >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def _prepare_attention_mask_for_generation( + self, + inputs: torch.Tensor, + pad_token_id: Optional[int], + eos_token_id: Optional[Union[int, List[int]]], + ) -> torch.LongTensor: + return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) # type: ignore + + def prepare_inputs_for_generation( + self, + input_ids, + token_type_ids, + pixel_values=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + if past_key_values is not None: + if position_ids is None: + # the reason we add + 2 + 1 here is because we have 2 additional vision tokens, + # and we need to add 1 to take into account the one extra token that is going to + # be sent through the model + position_ids = build_position_ids(token_type_ids, attention_mask) + 2 + 1 + position_ids = position_ids[:, -1:] + input_ids = input_ids[:, -1:] + token_type_ids = token_type_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "token_type_ids": token_type_ids, + "pixel_values": pixel_values, + "position_ids": position_ids, + "past_key_values": past_key_values, + 
"use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: "ModelOutput", + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + model_inputs: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + new_token_type_ids = ( + torch.ones(size=(token_type_ids.shape[0], 1), dtype=token_type_ids.dtype, device=token_type_ids.device) + * LANGUAGE_TOKEN_TYPE + ) + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, new_token_type_ids], dim=-1) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return model_kwargs + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past diff --git a/src/transformers/models/cogvlm/processing_cogvlm.py b/src/transformers/models/cogvlm/processing_cogvlm.py new file mode 100644 index 00000000000000..bb671627a40907 --- /dev/null +++ b/src/transformers/models/cogvlm/processing_cogvlm.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for CogVLM. +""" + +from typing import List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +LANGUAGE_TOKEN_TYPE = 0 +VISION_TOKEN_TYPE = 1 + + +class CogvlmProcessor(ProcessorMixin): + r""" + Constructs a CogVLM processor which wraps a CLIP image processor and a LLaMa tokenizer into a single processor. + + [`CogvlmProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~CogvlmProcessor.__call__`] and [`~CogvlmProcessor.decode`] for more information. + + Args: + image_processor (`CLIPImageProcessor`): + An instance of [`CLIPImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['LlamaTokenizer`]. The tokenizer is a required input. + image_size (`int`): + The image size used by the model. 
+ patch_size (`int`): + The patch size used by the model. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "CLIPImageProcessor" + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + + def __init__(self, image_processor, tokenizer, image_size: int, patch_size: int): + super().__init__(image_processor, tokenizer) + self.image_size = image_size + self.patch_size = patch_size + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = False, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = True, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_token_type_ids: bool = True, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and + [`LlamaTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + input_ids = [self.tokenizer.bos_token_id] + token_type_ids = [LANGUAGE_TOKEN_TYPE] + pixel_values = None + + if images is not None: + num_vision_tokens = (self.image_size // self.patch_size) ** 2 + 2 + input_ids += [self.tokenizer.pad_token_id] * num_vision_tokens + token_type_ids += [VISION_TOKEN_TYPE] * num_vision_tokens + pixel_values = self.image_processor(images, return_tensors=return_tensors).pixel_values + + if text is not None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + # TODO support the following 3 flags + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=None, + return_length=return_length, + verbose=verbose, + return_tensors=None, + **kwargs, + ) + text_ids = text_encoding.input_ids + input_ids += text_ids + token_type_ids += [LANGUAGE_TOKEN_TYPE] * len(text_ids) + + data = {} + data["input_ids"] = [input_ids] if return_tensors is not None else input_ids + if return_token_type_ids: + data["token_type_ids"] = [token_type_ids] if return_tensors is not None else token_type_ids + if return_attention_mask: + attention_mask = [1] * len(input_ids) + data["attention_mask"] = [attention_mask] if return_tensors is not None else attention_mask + + result = BatchFeature(data=data, tensor_type=return_tensors) + + if pixel_values is not None: + result["pixel_values"] = pixel_values + + return result + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
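`__call__` above always lays the sequence out as `[bos]`, a block of pad-token placeholders standing in for the image, then the text tokens, with a parallel `token_type_ids` marking which slots are vision. Using the toy sizes from the processor tests later in this diff (`image_size=10`, `patch_size=2`) and hypothetical token ids, the arithmetic looks like this:

```python
# Layout produced by CogvlmProcessor.__call__ for one image plus text (toy sizes).
image_size, patch_size = 10, 2
num_vision_tokens = (image_size // patch_size) ** 2 + 2   # 25 patches + 2 boundary slots = 27

bos_token_id, pad_token_id = 1, 0     # hypothetical ids; the real ones come from the tokenizer
text_ids = [306, 1074, 1023, 20452]   # hypothetical tokenized text

input_ids = [bos_token_id] + [pad_token_id] * num_vision_tokens + text_ids
token_type_ids = [0] + [1] * num_vision_tokens + [0] * len(text_ids)  # 0 = language, 1 = vision

assert len(input_ids) == len(token_type_ids) == 1 + num_vision_tokens + len(text_ids)
print(len(input_ids))  # 32
```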
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c9267debc5de81..7fdcc414937f28 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2052,6 +2052,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class CogvlmForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CogvlmModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CogvlmPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class CohereForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/cogvlm/__init__.py b/tests/models/cogvlm/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/cogvlm/test_modeling_cogvlm.py b/tests/models/cogvlm/test_modeling_cogvlm.py new file mode 100644 index 00000000000000..2b812d41c764c0 --- /dev/null +++ b/tests/models/cogvlm/test_modeling_cogvlm.py @@ -0,0 +1,261 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch CogVLM model.""" + +import copy +import unittest + +import requests + +from transformers import CogvlmConfig, CogvlmVisionConfig +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import CogvlmForCausalLM, CogvlmModel + + +if is_vision_available(): + from PIL import Image + + from transformers import CogvlmProcessor + + +class CogvlmModelTester: + def __init__( + self, + parent, + num_channels=3, + image_size=32, + patch_size=2, + batch_size=1, + text_seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + max_position_embeddings=512, + initializer_range=0.02, + num_labels=3, + ): + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.batch_size = batch_size + self.text_seq_length = text_seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.num_labels = num_labels + self.parent = parent + self.is_training = is_training + + self.vision_seq_length = (self.image_size // self.patch_size) ** 2 + 2 + self.seq_length = self.text_seq_length + self.vision_seq_length + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(torch_device) + token_type_ids = torch.cat( + [ + torch.zeros(self.batch_size, self.text_seq_length, dtype=torch.long), + torch.ones(self.batch_size, self.vision_seq_length, dtype=torch.long), + ], + dim=1, + ).to(torch_device) + + attention_mask = None + if self.use_input_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]).to(torch_device) + + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]).to( + torch_device + ) + + labels = ( + ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(torch_device) + if self.use_labels + else None + ) + + config = self.get_config() + + return config, input_ids, attention_mask, token_type_ids, pixel_values, labels + + def get_vision_config(self): + return CogvlmVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + ) + + def get_config(self): + return CogvlmConfig( + vision_config=self.get_vision_config().to_dict(), + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + 
max_position_embeddings=self.max_position_embeddings, + vocab_size=self.vocab_size, + ) + + def create_and_check_for_causal_lm(self, config, input_ids, attention_mask, token_type_ids, pixel_values, labels): + model = CogvlmForCausalLM(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + ) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + token_type_ids, + pixel_values, + _, + ) = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + return config, inputs_dict + + +@require_torch +class CogvlmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CogvlmForCausalLM, CogvlmModel) if is_torch_available() else () + pipeline_model_mapping = ( + {"feature-extraction": CogvlmModel, "image-to-text": CogvlmForCausalLM} if is_torch_available() else {} + ) + + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_torchscript = False + + def setUp(self): + self.model_tester = CogvlmModelTester(self) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + if return_labels: + if model_class.__name__ == "CogvlmForCausalLM": + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") + # Copied from tests.models.blip_2.test_modeling_blip_2.Blip2ModelTest.test_cpu_offload + def test_cpu_offload(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "THUDM/cogvlm-chat-hf" + model = CogvlmForCausalLM.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + +@require_vision +@require_torch +@slow +class CogvlmModelIntegrationTest(unittest.TestCase): + def test_inference_opt(self): + processor = CogvlmProcessor.from_pretrained("THUDM/cogvlm-chat-hf") + model = CogvlmForCausalLM.from_pretrained("THUDM/cogvlm-chat-hf", torch_dtype=torch.float16).to(torch_device) + + # prepare image + image = prepare_img() + inputs = processor(images=image, return_tensors="pt").to(torch_device, dtype=torch.float16) + + predictions = model.generate(**inputs) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + # Test output + self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) + self.assertEqual("a woman sitting on the beach with a dog", generated_text) + + # image and context + prompt = "Question: which city is this? 
Answer:" + inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) + + predictions = model.generate(**inputs) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + # Test output + self.assertEqual( + predictions[0].tolist(), + [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + ) + self.assertEqual(generated_text, "it's not a city, it's a beach") diff --git a/tests/models/cogvlm/test_processor_cogvlm.py b/tests/models/cogvlm/test_processor_cogvlm.py new file mode 100644 index 00000000000000..67b6cebbe35117 --- /dev/null +++ b/tests/models/cogvlm/test_processor_cogvlm.py @@ -0,0 +1,159 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import ( + AutoProcessor, + CLIPImageProcessor, + CogvlmProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, + ) + + +@require_vision +class CogvlmProcessorTest(unittest.TestCase): + # Ignore copy + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + image_processor = CLIPImageProcessor() + tokenizer = LlamaTokenizerFast.from_pretrained("stas/tiny-random-llama-2") + + processor = CogvlmProcessor(image_processor, tokenizer, image_size=10, patch_size=2) + + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_additional_features(self): + processor = CogvlmProcessor( + tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor(), image_size=10, patch_size=2 + ) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = CogvlmProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CogvlmProcessor(tokenizer=tokenizer, image_processor=image_processor, image_size=10, patch_size=2) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CogvlmProcessor(tokenizer=tokenizer, image_processor=image_processor, image_size=10, patch_size=2) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str, return_token_type_ids=False) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CogvlmProcessor(tokenizer=tokenizer, image_processor=image_processor, image_size=10, patch_size=2) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CogvlmProcessor(tokenizer=tokenizer, image_processor=image_processor, image_size=10, patch_size=2) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CogvlmProcessor(tokenizer=tokenizer, image_processor=image_processor, image_size=10, patch_size=2) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + 
+ self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index cd87d09ec8ec6d..427b5789de46f5 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -469,6 +469,8 @@ src/transformers/models/clip/modeling_tf_clip.py src/transformers/models/clipseg/configuration_clipseg.py src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py src/transformers/models/codegen/modeling_codegen.py +src/transformers/models/cogvlm/configuration_cogvlm.py +src/transformers/models/cogvlm/modeling_cogvlm.py src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py src/transformers/models/convbert/modeling_convbert.py
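Because the decoder exposes both an eager and an SDPA attention class (`COGVLM_ATTENTION_CLASSES`) and `CogvlmPreTrainedModel` sets `_supports_sdpa = True`, the backend can be chosen at load time through the standard `attn_implementation` argument. A short sketch, using the checkpoint name from the docstrings above and assuming this PR is installed:

```python
import torch
from transformers import CogvlmForCausalLM

# "sdpa" selects CogvlmVisionExpertSdpaAttention, "eager" selects CogvlmVisionExpertAttention.
model = CogvlmForCausalLM.from_pretrained(
    "THUDM/cogvlm-chat-hf",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
)
```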