llama : first attempt to implement vision API (WIP) #9687

Draft: wants to merge 9 commits into base: master.
22 changes: 20 additions & 2 deletions Makefile
@@ -926,6 +926,7 @@ OBJ_LLAMA = \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/llama-vision.o \
src/unicode.o \
src/unicode-data.o

@@ -937,6 +938,7 @@ OBJ_COMMON = \
common/ngram-cache.o \
common/sampling.o \
common/train.o \
common/vision.o \
common/build-info.o \
common/json-schema-to-grammar.o

@@ -1120,6 +1122,7 @@ src/llama.o: \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/llama-vision.h \
src/unicode.h \
include/llama.h \
ggml/include/ggml-cuda.h \
@@ -1152,6 +1155,17 @@ src/llama-sampling.o: \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/llama-vision.o: \
src/llama-vision.cpp \
src/llama-vision.h \
include/llama.h \
ggml/include/ggml-cuda.h \
ggml/include/ggml-metal.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@

$(LIB_LLAMA): \
$(OBJ_LLAMA) \
$(LIB_GGML)
@@ -1209,6 +1223,12 @@ common/ngram-cache.o: \
common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common/vision.o: \
common/vision.cpp \
common/vision.h \
common/stb_image.h
$(CXX) $(CXXFLAGS) -c $< -o $@

$(LIB_COMMON): \
$(OBJ_COMMON) \
$(LIB_LLAMA) \
@@ -1457,7 +1477,6 @@ llama-server: \
examples/server/json-schema-to-grammar.mjs.hpp \
examples/server/loading.html.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -1480,7 +1499,6 @@ libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
common/stb_image.h \
common/base64.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
21 changes: 21 additions & 0 deletions common/common.cpp
@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
return result;
}

// TODO: this function is hacky and needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
static const std::string IMG_PLACEMENT = "<img_placement>";
std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part;
auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// insert the image placeholder token between two text parts
output.push_back(TOKEN_IMG_PLACEMENT);
}
}
return output;
}

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
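
For context, a minimal sketch of how the new llama_tokenize_with_img helper might be called from an example program; the prompt text and the surrounding setup are assumptions for illustration and are not part of this diff:

// hypothetical caller; assumes ctx is an already-initialized llama_context
std::string prompt = "USER: <img_placement>\ndescribe this image\nASSISTANT:";
// each "<img_placement>" marker is replaced by the sentinel TOKEN_IMG_PLACEMENT (-1000)
std::vector<llama_token> tokens = llama_tokenize_with_img(ctx, prompt, /*add_special=*/ true);
// downstream code can scan for the sentinel and splice in image embeddings at that position
for (llama_token t : tokens) {
    if (t == TOKEN_IMG_PLACEMENT) {
        // the vision path would insert image embedding tokens here
    }
}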
25 changes: 25 additions & 0 deletions common/common.h
@@ -378,6 +378,20 @@ static std::vector<T> string_split(const std::string & str, char delim) {
return values;
}

// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);

const llama_token TOKEN_IMG_PLACEMENT = -1000;

// tokenize with "placeholder" for image embedding tokens
// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
// TODO: this function is hacky and needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
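
A quick illustration of the new string_split overload (the input string is made up for the example). Because the loop pushes the remainder after the last delimiter, a marker at the very start or end of the string still yields an empty leading or trailing part, which is what llama_tokenize_with_img relies on to place the image sentinel correctly:

// illustration only, not part of the diff
std::vector<std::string> parts = string_split(std::string("hello <img_placement> world"), std::string("<img_placement>"));
// parts == { "hello ", " world" }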
38 changes: 38 additions & 0 deletions common/vision.cpp
@@ -0,0 +1,38 @@
#include "vision.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#include <vector>
#include <fstream>

llama_img * load_image_from_file(const char * fname) {
std::ifstream file(fname, std::ios::binary);
if (!file) {
throw std::runtime_error("Unable to open file");
}
std::vector<char> image_bytes = std::vector<char>(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>());
// decode image to byte array
int nx, ny, nc;
auto * bytes = (unsigned char *) image_bytes.data();
auto * img = stbi_load_from_memory(bytes, image_bytes.size(), &nx, &ny, &nc, 3);
if (!img) {
throw std::runtime_error("failed to decode image bytes");
}
// printf("nx=%d ny=%d nc=%d\n", nx, ny, nc);
// GGML_ASSERT(nc == 3);
// for (int y = 0; y < ny; y++) {
// for (int x = 0; x < nx; x++) {
// unsigned char * pix = img + x*nc + y*nc*nx;
// printf("%02x%02x%02x ", pix[0], pix[1], pix[2]);
// }
// printf("\n");
// }
// printf("\n");
llama_img * result = llama_img_init(nx, ny);
memcpy(result->data, img, nx*ny*3);
stbi_image_free(img);
return result;
}
8 changes: 8 additions & 0 deletions common/vision.h
@@ -0,0 +1,8 @@
#pragma once

#include "llama.h"

#include <string>
#include <vector>

llama_img * load_image_from_file(const char * fname);
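
A short usage sketch for the loader declared above; the file name is arbitrary and the only llama_img field relied on is the data buffer filled by the memcpy in common/vision.cpp:

// illustration only; load_image_from_file throws std::runtime_error on I/O or decode failure
try {
    llama_img * img = load_image_from_file("example.jpg");
    // img->data now holds the decoded pixels as tightly packed RGB
    // (stbi_load_from_memory is asked for 3 channels, so other input formats are converted)
    // ... hand the image to the vision API ...
} catch (const std::runtime_error & e) {
    fprintf(stderr, "failed to load image: %s\n", e.what());
}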
68 changes: 64 additions & 4 deletions convert_hf_to_gguf.py
@@ -11,6 +11,7 @@
import os
import re
import sys
from transformers import AutoConfig
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
@@ -66,6 +67,12 @@ class Model:
dir_model_card: Path
is_lora: bool

# for vision model
preprocessor_config: dict[str, Any] | None = None
vparams: dict[str, Any] | None = None
v_tensor_map: gguf.TensorNameMap
v_tensor_names: set[str] | None

# subclasses should define this!
model_arch: gguf.MODEL_ARCH

@@ -95,6 +102,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py
self.preprocessor_config = self.load_preprocessor_config(self.dir_model)

# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -210,9 +218,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |

def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:
new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is not None:
return new_name
elif new_name_vision is not None:
return new_name_vision
else:
raise ValueError(f"Can not map tensor {name!r}")
return new_name

def set_gguf_parameters(self):
self.gguf_writer.add_block_count(self.block_count)
@@ -452,7 +464,22 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
hparams = json.load(f)
if "text_config" in hparams:
text_config = hparams["text_config"]
if "_name_or_path" in text_config:
text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
hparams = {**text_config, **hparams}
return hparams

@staticmethod
def load_preprocessor_config(dir_model: Path):
file_path = dir_model / "preprocessor_config.json"
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
return json.load(f)
else:
return None

@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1501,10 +1528,17 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed norms: {norms}")


@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration")
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if "vision_config" in self.hparams:
self.vparams = self.hparams["vision_config"]
if self.vparams is not None:
self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"])

def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -1554,6 +1588,26 @@ def set_gguf_parameters(self):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

# For vision model
if self.vparams is not None and self.preprocessor_config is not None:
self.gguf_writer.add_vision_type("clip-vit")
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
self.gguf_writer.add_vision_clip_architecture("llava")
self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
# TODO: should not hardcode these, but they are currently missing from config.json
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)

@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@@ -1568,6 +1622,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")

# For vision model
if name.startswith("language_model"):
name = name.replace("language_model.", "")
if "post_layernorm" in name:
return [] # skip post_layernorm

if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
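
For reference, the max_pos_embd value computed in set_gguf_parameters above is the number of ViT patches plus one class token. With the CLIP-ViT-L/14-336 settings commonly used by LLaVA (image_size = 336, patch_size = 14, quoted here only as an example), that works out to:

(336 // 14)**2 + 1 = 24**2 + 1 = 577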
2 changes: 1 addition & 1 deletion examples/llava/clip.cpp
@@ -23,7 +23,7 @@
#include "ggml-vulkan.h"
#endif

#define STB_IMAGE_IMPLEMENTATION
#include "vision.h" // without this, we get duplicated symbol error
#include "stb_image.h"

#include <cassert>
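
The reasoning behind this change, as far as it can be inferred from the diff: stb_image is a single-header library, so exactly one translation unit may define STB_IMAGE_IMPLEMENTATION before including stb_image.h. Since common/vision.cpp now provides that definition, clip.cpp has to include the header only; defining the macro here as well would emit the implementation symbols twice and break linking. A minimal sketch of the pattern:

// common/vision.cpp: the single translation unit that compiles the stb_image implementation
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

// examples/llava/clip.cpp (and any other user): header only, no implementation macro
#include "stb_image.h"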