diff --git a/convert.py b/convert.py
index 54e6a71aacad79..8cf481c27a7dc2 100644
--- a/convert.py
+++ b/convert.py
@@ -133,7 +133,7 @@ def make_tensors_list() -> List[str]:
 @dataclass
 class Params:
     n_vocab: int
-    n_vocab_sp: int
+    n_vocab_base: int
     n_embd: int
     n_mult: int
     n_head: int
@@ -146,7 +146,7 @@ def guessed(model: 'LazyModel', vocab: 'Vocab', file_type: GGMLFileType) -> 'Par
 
         return Params(
             n_vocab=n_vocab,
-            n_vocab_sp=vocab.vocab_special_size,
+            n_vocab_base=vocab.vocab_size_base,
             n_embd=n_embd,
             n_mult=256,
             n_head=n_embd // 128,
@@ -190,7 +190,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             tokenizer_config = {}
         for key, value in tokenizer_config.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1:
@@ -203,15 +203,13 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             special_tokens = {}
         for key, value in special_tokens.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1 or token_id in self.special_tokens_map:
                 continue
             self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
 
-        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
-
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         special_tokens = [tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id()]
@@ -258,7 +256,7 @@ def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
         self.special_tokens = []
         self.vocab_size = len(tokens)
-        self.vocab_special_size = 0
+        self.vocab_size_base = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens
@@ -976,17 +974,16 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
-            4,  # file version
+            1,  # file version
             params.n_vocab,
-            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
             params.n_layer,
-            params.n_embd // params.n_head,  # rot (obsolete)
+            params.n_vocab_base | 0xF0000000,  # reuse obsolete rot value to store vocab_base
            params.file_type.value,
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        self.fout.write(struct.pack("I" * len(values), *values))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')
@@ -1000,13 +997,15 @@ def write_vocab(self, vocab: Vocab) -> None:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
+
+    def write_vocab_sp(self, vocab: Vocab) -> None:
         for token_id in vocab.all_special_tokens():
             self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
         of = OutputFile(fname_out)
         of.write_file_header(params)
diff --git a/llama.cpp b/llama.cpp
index 2b59343cf46f97..07fc17f0d52e67 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -128,13 +128,12 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
-    uint32_t n_vocab_sp = 0;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx   = 512;   // this is provided as user input?
     uint32_t n_embd  = 4096;
     uint32_t n_mult  = 256;
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -460,7 +459,6 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-    LLAMA_FILE_VERSION_GGJT_V4, // improved support for added/special tokens
 };
 
 struct llama_file_loader {
@@ -476,6 +474,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(file_idx, tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -498,7 +497,6 @@ struct llama_file_loader {
             case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
             case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
             case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-            case 4: file_version = LLAMA_FILE_VERSION_GGJT_V4; return;
             }
         }
 
@@ -507,12 +505,12 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_vocab_sp = file_version >= LLAMA_FILE_VERSION_GGJT_V4 ? file.read_u32() : 0;
         hparams.n_embd  = file.read_u32();
         hparams.n_mult  = file.read_u32();
         hparams.n_head  = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot   = file.read_u32();
+        hparams.n_vocab_base = file.read_u32();
+        hparams.n_vocab_base = (hparams.n_vocab_base & 0xF0000000) == 0 ? hparams.n_vocab : (hparams.n_vocab_base & ~0xF0000000); // this bitwise operation is necessary for compatibility with older models
         hparams.ftype  = (enum llama_ftype) file.read_u32();
     }
     void read_vocab() {
@@ -533,20 +531,6 @@ struct llama_file_loader {
             tok_score.tok = std::move(word);
             tok_score.score = score;
         }
-
-        vocab.special_token_to_id.reserve(hparams.n_vocab_sp);
-
-        for (uint32_t i = 0; i < hparams.n_vocab_sp; i++) {
-            llama_vocab::id token_id = file.read_u32();
-            const auto & word = vocab.id_to_token[token_id].tok;
-
-            vocab.special_token_trie.add(word);
-            vocab.special_token_to_id[word] = token_id;
-
-            if (vocab.max_special_token_length < word.size()) {
-                vocab.max_special_token_length = word.size();
-            }
-        }
     }
     void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
@@ -601,6 +585,24 @@ struct llama_file_loader {
             tensors_map.tensors.at(idx).shards.push_back(shard);
         }
     }
+    void set_vocab_sp() {
+        uint32_t vocab_sp = 3 + hparams.n_vocab - hparams.n_vocab_base;
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            llama_vocab::id token_id = i > 2 ? hparams.n_vocab_base + i - 3 : i;
+            const auto & word = vocab.id_to_token[token_id].tok;
+            if (word.empty()) {
+                continue;
+            }
+
+            vocab.special_token_trie.add(word);
+            vocab.special_token_to_id[word] = token_id;
+
+            if (vocab.max_special_token_length < word.size()) {
+                vocab.max_special_token_length = word.size();
+            }
+        }
+    }
 };
 
 struct llama_file_saver {
@@ -620,12 +622,11 @@ struct llama_file_saver {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_vocab_sp);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // this bitwise operation is necessary for compatibility with older models
         file.write_u32(new_ftype);
     }
     void write_vocab() {
@@ -639,9 +640,6 @@ struct llama_file_saver {
             file.write_raw(token_score.tok.data(), token_score.tok.size());
             file.write_raw(&token_score.score, sizeof(token_score.score));
         }
-        for (const auto & pair : any_file_loader->vocab.special_token_to_id) {
-            file.write_u32(pair.second);
-        }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
         switch (new_type) {
@@ -1015,8 +1013,7 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
-        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (pre #1931)";
-        case LLAMA_FILE_VERSION_GGJT_V4: return "ggjt v4 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -1113,7 +1110,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_embd/hparams.n_head);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
diff --git a/llama.h b/llama.h
index 26121536cff601..1241ba6c0ec443 100644
--- a/llama.h
+++ b/llama.h
@@ -32,7 +32,7 @@
 #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
 #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION           4
+#define LLAMA_FILE_VERSION           3
 #define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
 #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
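
Side note on the header trick above (not part of the patch): the obsolete rot slot now carries `n_vocab_base` with its top nibble set as a marker, so an older file, where that slot still holds a small `n_rot` value such as 64 or 128, decodes to `n_vocab_base == n_vocab` and simply has no added-token region. Below is a minimal standalone sketch of that round trip; the helper names `encode_rot_field` and `decode_vocab_base` are hypothetical and only mirror the expressions used in `write_hparams` and `read_hparams`.

```cpp
#include <cstdint>
#include <cstdio>

// Writer side: stash n_vocab_base in the obsolete rot field, top nibble set as a marker.
static uint32_t encode_rot_field(uint32_t n_vocab_base) {
    return n_vocab_base | 0xF0000000u;
}

// Reader side: if the marker bits are absent, the file predates this change and the
// field still holds n_rot, so fall back to n_vocab (i.e. no added/special tokens).
static uint32_t decode_vocab_base(uint32_t rot_field, uint32_t n_vocab) {
    return (rot_field & 0xF0000000u) == 0 ? n_vocab : (rot_field & ~0xF0000000u);
}

int main() {
    // New-style file: 32000 base tokens, 16 added tokens on top (n_vocab = 32016).
    const uint32_t stored = encode_rot_field(32000);
    std::printf("new file -> n_vocab_base = %u\n", decode_vocab_base(stored, 32016)); // 32000

    // Old-style file: the field still holds n_rot (e.g. 128), marker bits unset.
    std::printf("old file -> n_vocab_base = %u\n", decode_vocab_base(128, 32000));    // 32000
    return 0;
}
```

The marker only works because a real `n_rot` never has any of its top four bits set, which holds for any plausible head dimension.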