Loading GGUF files support #30391

Merged
merged 37 commits into from
May 15, 2024
Changes from 9 commits
37 commits
fb00288
Adds support for loading GGUF files
LysandreJik Apr 19, 2024
81e4324
add q2_k q3_k q5_k support from @99991
younesbelkada Apr 22, 2024
8a0d5b8
fix tests
younesbelkada Apr 22, 2024
08534f3
Update doc
LysandreJik Apr 22, 2024
ebd9944
Style
LysandreJik Apr 22, 2024
5c913ec
Docs
LysandreJik Apr 22, 2024
8b81bfb
Merge remote-tracking branch 'upstream/main' into HEAD
younesbelkada Apr 22, 2024
c49f1a8
fix CI
younesbelkada Apr 22, 2024
7fa538b
Update docs/source/en/gguf.md
younesbelkada Apr 22, 2024
5485327
Update docs/source/en/gguf.md
younesbelkada Apr 22, 2024
074f05e
Merge branch 'main' into gguf-support
younesbelkada Apr 23, 2024
ca8363e
Compute merges
LysandreJik Apr 23, 2024
2a0c9b0
Merge branch 'main' into gguf-support
younesbelkada Apr 25, 2024
fac7bb3
Merge branch 'main' into gguf-support
younesbelkada Apr 25, 2024
45983db
Merge remote-tracking branch 'upstream/main' into HEAD
younesbelkada Apr 30, 2024
e6c6f6c
change logic
younesbelkada Apr 30, 2024
a6cd08c
add comment for clarity
younesbelkada Apr 30, 2024
6611877
add comment for clarity
younesbelkada Apr 30, 2024
455163b
Update src/transformers/models/auto/tokenization_auto.py
younesbelkada Apr 30, 2024
42d5815
change logic
younesbelkada Apr 30, 2024
1d3acec
Update src/transformers/modeling_utils.py
younesbelkada Apr 30, 2024
af3c42c
change
younesbelkada Apr 30, 2024
a27db0c
Merge branch 'gguf-support' of https://github.com/lysandrejik/transfo…
younesbelkada Apr 30, 2024
14ad10c
Apply suggestions from code review
younesbelkada Apr 30, 2024
ab621a7
Update src/transformers/modeling_gguf_pytorch_utils.py
younesbelkada Apr 30, 2024
207820a
put back comment
younesbelkada Apr 30, 2024
1fef8ad
add comment about mistral
younesbelkada Apr 30, 2024
9ae7363
comments and added tests
younesbelkada Apr 30, 2024
3ed384f
fix merge
younesbelkada May 14, 2024
55eb860
fix unconsistent type
younesbelkada May 14, 2024
f754335
more
younesbelkada May 14, 2024
a449078
Merge remote-tracking branch 'origin/main' into HEAD
younesbelkada May 14, 2024
3bdbb2e
fix tokenizer
younesbelkada May 15, 2024
0ab79f6
Update src/transformers/modeling_utils.py
younesbelkada May 15, 2024
65433c4
address comments about tests and tokenizer + add added_tokens
younesbelkada May 15, 2024
1b5ae54
from_gguf -> gguf_file
younesbelkada May 15, 2024
d6b67c6
replace on docs too
younesbelkada May 15, 2024
4 changes: 4 additions & 0 deletions docker/transformers-all-latest-gpu/Dockerfile
@@ -45,6 +45,10 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
# For video model testing
RUN python3 -m pip install --no-cache-dir decord av==9.2.0

# For GGUF tests
RUN python3 -m pip install --no-cache-dir gguf


# For `dinat` model
# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -137,6 +137,8 @@
title: Troubleshoot
- local: hf_quantizer
title: Contribute new quantization method
- local: gguf
title: Interoperability with GGUF files
title: Developer guides
- sections:
- local: performance
95 changes: 95 additions & 0 deletions docs/source/en/gguf.md
@@ -0,0 +1,95 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# GGUF and interaction with Transformers

The GGUF file format is used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and other
libraries that depend on it, like the very popular [llama.cpp](https://github.com/ggerganov/llama.cpp) or
[whisper.cpp](https://github.com/ggerganov/whisper.cpp).

It is a file format [supported by the Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf) with features
allowing for quick inspection of tensors and metadata within the file.

This file format is designed as a "single-file format", where one file usually contains the configuration
attributes, the tokenizer vocabulary, and other attributes, as well as all the tensors to be loaded into the model. These
files come in different formats according to the quantization type of the file. We briefly go over some of them
[here](https://huggingface.co/docs/hub/en/gguf#quantization-types).
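
As a quick illustration, here is a minimal sketch of peeking at the metadata and tensors packed into such a file,
assuming the `GGUFReader` API exposed by the `gguf` Python package (the sketch, including the file name, is
illustrative and not part of this PR):

```py
from gguf import GGUFReader  # pip install gguf

# Open a local GGUF file and inspect what it bundles into a single file.
reader = GGUFReader("tinyllama-1.1b-chat-v1.0.Q6_K.gguf")

# Metadata fields hold the configuration attributes and tokenizer vocabulary.
for name in list(reader.fields)[:10]:
    print("field:", name)

# Each tensor records its name, shape, and quantization type.
for tensor in reader.tensors[:5]:
    print("tensor:", tensor.name, tuple(tensor.shape), tensor.tensor_type.name)
```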

## Support within Transformers

We have added the ability to load `gguf` files within `transformers` in order to offer further training/fine-tuning
capabilities to `gguf` models, before converting those models back to `gguf` for use within the `ggml` ecosystem. When
loading a model, we first dequantize it to fp32 before loading the weights to be used in PyTorch.
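
To make the dequantization step concrete, here is a minimal numpy sketch for the simple Q8_0 case, assuming the
standard layout of one fp16 scale followed by 32 int8 weights per block; the dequantization actually used by this PR
lives in the new `integrations/ggml.py` module (e.g. `load_dequant_gguf_tensor`), not in this snippet:

```py
import numpy as np

def dequantize_q8_0(raw_bytes: bytes, n_elements: int) -> np.ndarray:
    """Illustrative Q8_0 dequantization: value = fp16 block scale * int8 quant."""
    block_size = 32                    # elements per block
    bytes_per_block = 2 + block_size   # 2-byte fp16 scale + 32 int8 quants

    n_blocks = n_elements // block_size
    blocks = np.frombuffer(
        raw_bytes, dtype=np.uint8, count=n_blocks * bytes_per_block
    ).reshape(n_blocks, bytes_per_block)

    scales = blocks[:, :2].copy().view(np.float16).astype(np.float32)  # (n_blocks, 1)
    quants = blocks[:, 2:].view(np.int8).astype(np.float32)            # (n_blocks, 32)
    return (scales * quants).reshape(-1)                               # fp32 weights
```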

> [!NOTE]
> The support is still very exploratory and we welcome contributions in order to solidify it across quantization types
> and model architectures.

For now, here are the supported model architectures and quantization types:

### Supported quantization types

The initial supported quantization types were chosen according to the popular quantized files that have been shared
on the Hub.

- F32
- Q2_K
- Q3_K
- Q4_0
- Q4_K
- Q6_K
- Q8_0

We take inspiration from the excellent [99991/pygguf](https://github.com/99991/pygguf) Python parser to dequantize the
weights.

### Supported model architectures

For now, the supported model architectures are those that have been very popular on the Hub, namely:

- LLaMa
- Mistral

## Example usage

In order to load `gguf` files in `transformers`, you should pass the `from_gguf` argument to the `from_pretrained`
methods of both tokenizers and models. Here is how one would load a tokenizer and a model, both of which can be loaded
from the exact same file:

```py
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"

tokenizer = AutoTokenizer.from_pretrained(model_id, from_gguf=filename)
model = AutoModelForCausalLM.from_pretrained(model_id, from_gguf=filename)
```

amyeroberts (Collaborator) commented on Apr 25, 2024:

What would happen if I passed in a quantization config with the from_pretrained call? gguf -> unquantized -> requantized?

I see this is handled in modeling utils ❤️
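
As a purely hypothetical sketch of the combination discussed in this thread (continuing the snippet above; whether a
given quantization backend accepts this end-to-end is handled in modeling utils, per the reply, and is an assumption
of this sketch):

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical: dequantize the GGUF weights on load, then re-quantize them with
# bitsandbytes 4-bit quantization. Support for this path is an assumption here.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model_requantized = AutoModelForCausalLM.from_pretrained(
    model_id, from_gguf=filename, quantization_config=quantization_config
)
```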

Now you have access to the full, unquantized version of the model in the PyTorch ecosystem, where you can combine it
with a plethora of other tools.
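
Continuing the snippet above, a quick illustrative sanity check that the dequantized model behaves like any other
PyTorch model:

```py
# The loaded model is a regular PyTorch model, so standard generation works.
inputs = tokenizer("The GGUF file format is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```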

In order to convert back to a `gguf` file, we recommend using the
[`convert-hf-to-gguf.py` file](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) from llama.cpp.

Here's how you would complete the script above to save the model and export it back to `gguf`:

```py
tokenizer.save_pretrained('directory')
model.save_pretrained('directory')

!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory}
```

Comment on lines +92 to +95

Collaborator: It would be nice if we had this within save_pretrained using e.g. a save_gguf flag

Contributor: Yes, this is part of the full integration, will do that in a follow-up PR!
15 changes: 11 additions & 4 deletions src/transformers/configuration_utils.py
@@ -27,6 +27,7 @@

from . import __version__
from .dynamic_module_utils import custom_object_save
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .utils import (
CONFIG_NAME,
PushToHubMixin,
@@ -658,6 +659,8 @@ def _get_config_dict(
from_auto_class = kwargs.pop("_from_auto", False)
commit_hash = kwargs.pop("_commit_hash", None)

from_gguf = kwargs.get("from_gguf", None)
Collaborator: Should this be pop here?

Suggested change:
from_gguf = kwargs.get("from_gguf", None)
from_gguf = kwargs.pop("from_gguf", None)

Contributor: Here I think it should be get, as from_gguf is used later in case one uses Auto classes.

Collaborator: Ah, OK!


if trust_remote_code is True:
logger.warning(
"The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is"
@@ -676,10 +679,10 @@
resolved_config_file = pretrained_model_name_or_path
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
configuration_file = pretrained_model_name_or_path
configuration_file = pretrained_model_name_or_path if from_gguf is None else from_gguf
resolved_config_file = download_url(pretrained_model_name_or_path)
else:
configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME)
configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) if from_gguf is None else from_gguf

try:
# Load from local folder or from cache or download from model Hub and cache
@@ -712,8 +715,12 @@
)

try:
# Load config dict
config_dict = cls._dict_from_json_file(resolved_config_file)
if from_gguf is None:
Collaborator: Same comment here re testing for the positive case

Contributor: makes sense, changed!

# Load config dict
config_dict = cls._dict_from_json_file(resolved_config_file)
else:
config_dict = load_gguf_checkpoint(resolved_config_file, return_tensors=False)["config"]

config_dict["_commit_hash"] = commit_hash
except (json.JSONDecodeError, UnicodeDecodeError):
raise EnvironmentError(
5 changes: 2 additions & 3 deletions src/transformers/convert_slow_tokenizer.py
@@ -1394,9 +1394,8 @@ def tokenizer(self, proto):

def normalizer(self, proto):
sequence = []
if hasattr(self.original_tokenizer, "add_prefix_space"):
if self.original_tokenizer.add_prefix_space:
sequence += [normalizers.Prepend(prepend="▁")]
if getattr(self.original_tokenizer, "add_prefix_space", True):
sequence += [normalizers.Prepend(prepend="▁")]
sequence += [normalizers.Replace(pattern=" ", content="▁")]
return normalizers.Sequence(sequence)

16 changes: 16 additions & 0 deletions src/transformers/integrations/__init__.py
@@ -42,6 +42,14 @@
"set_hf_deepspeed_config",
"unset_hf_deepspeed_config",
],
"ggml": [
"GGUF_CONFIG_MAPPING",
"GGUF_TENSOR_MAPPING",
"GGUF_TOKENIZER_MAPPING",
"_gguf_parse_value",
"load_dequant_gguf_tensor",
"load_gguf",
],
"integration_utils": [
"INTEGRATION_TO_CALLBACK",
"AzureMLCallback",
@@ -111,6 +119,14 @@
set_hf_deepspeed_config,
unset_hf_deepspeed_config,
)
from .ggml import (
GGUF_CONFIG_MAPPING,
GGUF_TENSOR_MAPPING,
GGUF_TOKENIZER_MAPPING,
_gguf_parse_value,
load_dequant_gguf_tensor,
load_gguf,
)
from .integration_utils import (
INTEGRATION_TO_CALLBACK,
AzureMLCallback,