huggingface · tomaarsen · Dec 19, 2024 · Dec 9, 2024 · Dec 10, 2024 · Dec 11, 2024
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -450,6 +450,7 @@
     "models.fuyu": ["FuyuConfig"],
     "models.gemma": ["GemmaConfig"],
     "models.gemma2": ["Gemma2Config"],
+    "models.modernbert": ["ModernBertConfig"],
     "models.git": [
         "GitConfig",
         "GitProcessor",
@@ -2299,6 +2300,15 @@
             "Gemma2PreTrainedModel",
         ]
     )
+    _import_structure["models.modernbert"].extend(
+        [
+            "ModernBertForCausalLM",
+            "ModernBertForSequenceClassification",
+            "ModernBertForTokenClassification",
+            "ModernBertModel",
+            "ModernBertPreTrainedModel",
+        ]
+    )
     _import_structure["models.git"].extend(
         [
             "GitForCausalLM",
@@ -5328,6 +5338,7 @@
     from .models.fuyu import FuyuConfig
     from .models.gemma import GemmaConfig
     from .models.gemma2 import Gemma2Config
+    from .models.modernbert import ModernBertConfig
     from .models.git import (
         GitConfig,
         GitProcessor,
@@ -7056,6 +7067,13 @@
             Gemma2Model,
             Gemma2PreTrainedModel,
         )
+        from .models.modernbert import (
+            ModernBertForCausalLM,
+            ModernBertForSequenceClassification,
+            ModernBertForTokenClassification,
+            ModernBertModel,
+            ModernBertPreTrainedModel,
+        )
         from .models.git import (
             GitForCausalLM,
             GitModel,

diff --git a/src/transformers/loss/loss_utils.py b/src/transformers/loss/loss_utils.py
@@ -47,6 +47,22 @@ def ForCausalLMLoss(
     return loss
 
 
+def ForMaskedLMLoss(
+    logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs
+):
+    # Upcast to float if we need to compute the loss to avoid potential precision issues
+    logits = logits.float()
+
+    # Flatten the tokens
+    logits = logits.view(-1, vocab_size)
+    labels = labels.view(-1)
+    # Enable model parallelism
+
+    labels = labels.to(logits.device)
+    loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index, **kwargs)
+    return loss
+
+
 def ForSequenceClassificationLoss(labels, pooled_logits, config, **kwargs):
     num_labels = config.num_labels
     if config.problem_type is None:
@@ -101,6 +117,7 @@ def ForTokenClassification(logits, labels, config, **kwargs):
 
 LOSS_MAPPING = {
     "ForCausalLM": ForCausalLMLoss,
+    "ForMaskedLM": ForMaskedLMLoss,
     "ForQuestionAnswering": ForQuestionAnsweringLoss,
     "ForSequenceClassification": ForSequenceClassificationLoss,
     "ForTokenClassification": ForTokenClassification,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -96,6 +96,7 @@
     fuyu,
     gemma,
     gemma2,
+    modernbert,
     git,
     glm,
     glpn,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -113,6 +113,7 @@
         ("fuyu", "FuyuConfig"),
         ("gemma", "GemmaConfig"),
         ("gemma2", "Gemma2Config"),
+        ("modernbert", "ModernBertConfig"),
         ("git", "GitConfig"),
         ("glm", "GlmConfig"),
         ("glpn", "GLPNConfig"),
@@ -417,6 +418,7 @@
         ("fuyu", "Fuyu"),
         ("gemma", "Gemma"),
         ("gemma2", "Gemma2"),
+        ("modernbert", "ModernBERT"),
         ("git", "GIT"),
         ("glm", "GLM"),
         ("glpn", "GLPN"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -110,6 +110,7 @@
         ("funnel", ("FunnelModel", "FunnelBaseModel")),
         ("gemma", "GemmaModel"),
         ("gemma2", "Gemma2Model"),
+        ("modernbert", "ModernBertModel"),
         ("git", "GitModel"),
         ("glm", "GlmModel"),
         ("glpn", "GLPNModel"),
@@ -487,6 +488,7 @@
         ("fuyu", "FuyuForCausalLM"),
         ("gemma", "GemmaForCausalLM"),
         ("gemma2", "Gemma2ForCausalLM"),
+        ("modernbert", "ModernBertForCausalLM"),
         ("git", "GitForCausalLM"),
         ("glm", "GlmForCausalLM"),
         ("gpt-sw3", "GPT2LMHeadModel"),
@@ -945,6 +947,7 @@
         ("funnel", "FunnelForSequenceClassification"),
         ("gemma", "GemmaForSequenceClassification"),
         ("gemma2", "Gemma2ForSequenceClassification"),
+        ("modernbert", "ModernBertForSequenceClassification"),
         ("glm", "GlmForSequenceClassification"),
         ("gpt-sw3", "GPT2ForSequenceClassification"),
         ("gpt2", "GPT2ForSequenceClassification"),
@@ -1136,6 +1139,7 @@
         ("funnel", "FunnelForTokenClassification"),
         ("gemma", "GemmaForTokenClassification"),
         ("gemma2", "Gemma2ForTokenClassification"),
+        ("modernbert", "ModernBertForTokenClassification"),
         ("glm", "GlmForTokenClassification"),
         ("gpt-sw3", "GPT2ForTokenClassification"),
         ("gpt2", "GPT2ForTokenClassification"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -203,6 +203,7 @@
                     "GemmaTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            ("modernbert", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
             ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
             ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
             ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),

diff --git a/src/transformers/models/modernbert/__init__.py b/src/transformers/models/modernbert/__init__.py
@@ -0,0 +1,61 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_modernbert": ["ModernBertConfig"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_modernbert"] = [
+        "ModernBertForCausalLM",
+        "ModernBertModel",
+        "ModernBertPreTrainedModel",
+        "ModernBertForSequenceClassification",
+        "ModernBertForTokenClassification",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_modernbert import ModernBertConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_modernbert import (
+            ModernBertForCausalLM,
+            ModernBertForSequenceClassification,
+            ModernBertForTokenClassification,
+            ModernBertModel,
+            ModernBertPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py
@@ -0,0 +1,183 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/modernbert/modular_modernbert.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_modernbert.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class ModernBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the ModernBert-7B.
+    e.g. [answerdotai/modernbert-base](https://huggingface.co/answerdotai/modernbert-base)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ModernBertModel`]
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
+        sliding_window (`int`, *optional*, defaults to 4096): in ModernBert, every other layer uses sliding window attention. This is the
+            size of the sliding window.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
+
+    ```python
+    >>> from transformers import ModernBertModel, ModernBertConfig
+    >>> # Initializing a ModernBert modernbert-7b style configuration
+    >>> configuration = ModernBertConfig()
+    >>> # Initializing a model from the modernbert-7b style configuration
+    >>> model = ModernBertModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "modernbert"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50368,
+        hidden_size=768,
+        intermediate_size=1152,
+        num_hidden_layers=22,
+        num_attention_heads=12,
+        hidden_activation="gelu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        initalizer_cutoff_factor=2.0,
+        norm_eps=1e-5,
+        norm_bias=False,
+        pad_token_id=50283,
+        eos_token_id=50282,
+        bos_token_id=50281,
+        cls_token_id=50281,
+        sep_token_id=50282,
+        tie_word_embeddings=True,
+        global_rope_theta=160000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        global_attn_every_n_layers=3,
+        local_attention=128,
+        local_rope_theta=10000.0,
+        skip_first_prenorm=True,
+        embedding_norm=True,
+        embedding_dropout=0.0,
+        mlp_bias=False,
+        mlp_dropout=0.0,
+        unpad_inputs=True,
+        unpad_no_grad=True,
+        decoder_bias=True,
+        classifier_dropout=0.0,
+        classifier_pooling="mean",
+        classifier_norm=True,
+        classifier_bias=False,
+        classifier_activation=None,
+        deterministic_flash_attn=False,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            cls_token_id=cls_token_id,
+            sep_token_id=sep_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.initializer_range = initializer_range
+        self.initalizer_cutoff_factor = initalizer_cutoff_factor
+        self.norm_eps = norm_eps
+        self.norm_bias = norm_bias
+        self.global_rope_theta = global_rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.global_attn_every_n_layers = global_attn_every_n_layers
+        self.local_attention = local_attention
+        self.local_rope_theta = local_rope_theta
+        self.skip_first_prenorm = skip_first_prenorm
+        self.embedding_norm = embedding_norm
+        self.embedding_dropout = embedding_dropout
+        self.mlp_bias = mlp_bias
+        self.mlp_dropout = mlp_dropout
+        self.unpad_inputs = unpad_inputs
+        self.unpad_no_grad = unpad_no_grad
+        self.decoder_bias = decoder_bias
+        self.classifier_dropout = classifier_dropout
+        self.classifier_pooling = classifier_pooling
+        self.classifier_bias = classifier_bias
+        self.classifier_norm = classifier_norm
+        self.classifier_activation = classifier_activation if classifier_activation is not None else hidden_activation
+        self.deterministic_flash_attn = deterministic_flash_attn
-Original file line number
+Diff line change
@@ Expand Up / @@ -96,6 +96,7 @@ @@
         fuyu,
         gemma,
         gemma2,
+        modernbert,
         git,
         glm,
         glpn,
@@ Expand Down @@