vocab : refactor tokenizer to reduce init overhead (#9449)
* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* remove unused files

* remove unused fields to avoid an unused-field build error

* avoid a duplicate-symbol link error

* Update src/llama.cpp

---------

Co-authored-by: Georgi Gerganov <[email protected]>
kylo5aby and ggerganov authored Sep 28, 2024
1 parent 9a91311 commit 6102037
Showing 5 changed files with 239 additions and 142 deletions.
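Only one of the five changed files is shown below: the rename inside the convert-llama2c-to-ggml example. The title's "reduce init overhead" and the "make llm_tokenizer more private" message point at src/llama.cpp, suggesting that expensive tokenizer construction is moved out of the per-call path. A minimal sketch of that init-once pattern, with hypothetical names (the actual llama.cpp types are not shown in this diff):

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

struct llm_tokenizer_state {
    // init-heavy data (e.g. merge ranks, sorted token table) built a single time
    std::vector<std::string> sorted_tokens;
};

struct tokenize_session {
    explicit tokenize_session(const llm_tokenizer_state & st) : state(st) {}
    const llm_tokenizer_state & state; // borrowed, never rebuilt per call
    std::vector<int32_t> out;          // cheap per-call scratch
};

struct vocab_with_tokenizer {
    std::unique_ptr<llm_tokenizer_state> tokenizer; // owned, created once at load
};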
14 changes: 7 additions & 7 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -201,7 +201,7 @@ static void print_sample_weights(TransformerWeights *w){

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

-struct llama_vocab {
+struct my_llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
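For context, here is the renamed struct reconstructed from the members this diff touches (a sketch only; the full definition in the example file may carry more, and llama_token_type comes from llama.h):

struct my_llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::vector<token_data> id_to_token;
};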
@@ -525,7 +525,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}

-static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
@@ -583,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
-for (llama_vocab::id id=0; id<n_vocab; ++id) {
+for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string text = file.read_string(len);

unsigned char byte_val;
-llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
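The reads above imply a simple per-token record layout in the llama2.c tokenizer file: a float score, a u32 byte length, then that many bytes of text. A sketch of the same loop body using std::ifstream (the example uses its own file helper, assumed here):

#include <cstdint>
#include <fstream>
#include <string>

// Reads one token record: f32 score, u32 len, then len bytes of text.
static bool read_token_record(std::ifstream & in, float & score, std::string & text) {
    uint32_t len = 0;
    if (!in.read(reinterpret_cast<char *>(&score), sizeof(score))) return false;
    if (!in.read(reinterpret_cast<char *>(&len),   sizeof(len)))   return false;
    text.resize(len);
    return len == 0 || static_cast<bool>(in.read(&text[0], len));
}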
@@ -631,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
}

static void save_as_llama_model(
-    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
@@ -671,7 +671,7 @@ static void save_as_llama_model(
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
-for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
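The three parallel arrays built above are the shape the GGUF writer expects: one array value per key. A sketch of the likely next step, using the public gguf API and the conventional tokenizer key names (the surrounding gguf_context is assumed):

gguf_set_arr_str (ctx, "tokenizer.ggml.tokens",     tokens.data(),     tokens.size());
gguf_set_arr_data(ctx, "tokenizer.ggml.scores",     GGUF_TYPE_FLOAT32, scores.data(),      scores.size());
gguf_set_arr_data(ctx, "tokenizer.ggml.token_type", GGUF_TYPE_INT32,   token_types.data(), token_types.size());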
@@ -905,7 +905,7 @@ int main(int argc, char ** argv) {
fclose(file);
}

-struct llama_vocab vocab;
+struct my_llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);

struct my_llama_model model;
