From 021b3bda0ef50119829490db6b98ae6f7451fe8a Mon Sep 17 00:00:00 2001 From: Zhenzhong1 Date: Tue, 9 Jan 2024 18:27:41 -0800 Subject: [PATCH] pylint & add gguf to requirements.txt --- neural_speed/convert/convert_chatglm.py | 179 ++++-------- neural_speed/convert/convert_llama.py | 344 ++++++++---------------- requirements.txt | 3 +- 3 files changed, 161 insertions(+), 365 deletions(-) diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 57de1d3ab..4ac7c771b 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -17,8 +17,8 @@ import numpy as np from pathlib import Path import argparse -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, - Literal, Optional, Sequence, Tuple, TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, + TypeVar, Union) from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -36,10 +36,9 @@ def bytes_to_unicode(): And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord("!"), - ord("~") + 1)) + list(range( - ord("¡"), - ord("¬") + 1)) + list(range(ord("®"), - ord("ÿ") + 1)) + ord("~") + 1)) + list(range(ord("¡"), + ord("¬") + 1)) + list(range(ord("®"), + ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2**8): @@ -54,10 +53,8 @@ def bytes_to_unicode(): class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, - fname_added_tokens: Optional[Path]) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor( - str(fname_tokenizer)) + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens)) @@ -68,13 +65,11 @@ def __init__(self, fname_tokenizer: Path, actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: raise Exception( - f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}" - ) + f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len( - self.added_tokens_list) + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens @@ -93,8 +88,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: byte_value = int(piece[3:-1], 16) text = struct.pack("B", byte_value) else: - text = tokenizer.id_to_piece(i).replace("\u2581", - " ").encode("utf-8") + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") score: float = tokenizer.get_score(i) yield text, score @@ -130,8 +124,7 @@ def load_vocab_for_glm1(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) def load_vocab_for_glm2(path: 
Path) -> SentencePieceVocab: @@ -152,12 +145,10 @@ def load_vocab_for_glm2(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, - hparams): +def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): print("ChatGLM-2.gguf converting: ") list_vars = model.state_dict() for name in list_vars.keys(): @@ -188,23 +179,14 @@ def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, gguf_writer.add_uint32('word_embed_proj_dim', 0) gguf_writer.add_uint32('do_layer_norm_before', 0) - gguf_writer.add_uint32('multi_query_group_num', - hparams["multi_query_group_num"]) + gguf_writer.add_uint32('multi_query_group_num', hparams["multi_query_group_num"]) gguf_writer.add_uint32('ffn_hidden_size', hparams["ffn_hidden_size"]) gguf_writer.add_uint32('inner_hidden_size', 0) - gguf_writer.add_int32( - 'bos_token_id', - tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1) - gguf_writer.add_int32( - 'eos_token_id', - tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1) - gguf_writer.add_int32( - 'pad_token_id', - tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1) - gguf_writer.add_int32( - 'sep_token_id', - tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1) + gguf_writer.add_int32('bos_token_id', tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1) + gguf_writer.add_int32('eos_token_id', tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1) + gguf_writer.add_int32('pad_token_id', tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1) + gguf_writer.add_int32('sep_token_id', tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1) def write_vocab_gguf(dir_model): print("gguf: get tokenizer metadata") @@ -215,9 +197,7 @@ def write_vocab_gguf(dir_model): if Path(dir_model + "/tokenizer.model").is_file(): # vocab type sentencepiece - print( - "gguf: get sentencepiece tokenizer vocab, scores and token types" - ) + print("gguf: get sentencepiece tokenizer vocab, scores and token types") vocab = load_vocab_for_glm2(Path(dir_model)) @@ -227,9 +207,7 @@ def write_vocab_gguf(dir_model): scores.append(score) if Path(dir_model + "/added_tokens.json").is_file(): - with open(dir_model + "/added_tokens.json", - "r", - encoding="utf-8") as f: + with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: addtokens_json = json.load(f) print("gguf: get added tokens") @@ -248,51 +226,37 @@ def write_vocab_gguf(dir_model): if Path(dir_model + "/tokenizer.json").is_file(): # Look for special tokens in tokenizer.json if it exists - with open(dir_model + "/tokenizer.json", "r", - encoding="utf-8") as f: + with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: tokenizer = json.load(f) - if "added_tokens" in tokenizer and Path( - dir_model + "/tokenizer_config.json").is_file(): + if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file(): - with open(dir_model + "/tokenizer_config.json", - "r", - encoding="utf-8") as f: + with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: tokenizer_config = json.load(f) - if "bos_token" in 
tokenizer_config and tokenizer_config[ - "bos_token"] != None: + if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["bos_token"][ - "content"]: + if key["content"] == tokenizer_config["bos_token"]["content"]: gguf_writer.add_bos_token_id(key["id"]) - if "eos_token" in tokenizer_config and tokenizer_config[ - "eos_token"] != None: + if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["eos_token"][ - "content"]: + if key["content"] == tokenizer_config["eos_token"]["content"]: gguf_writer.add_eos_token_id(key["id"]) - if "unk_token" in tokenizer_config and tokenizer_config[ - "unk_token"] != None: + if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["unk_token"][ - "content"]: + if key["content"] == tokenizer_config["unk_token"]["content"]: gguf_writer.add_unk_token_id(key["id"]) - if "sep_token" in tokenizer_config and tokenizer_config[ - "sep_token"] != None: + if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["sep_token"][ - "content"]: + if key["content"] == tokenizer_config["sep_token"]["content"]: gguf_writer.add_sep_token_id(key["id"]) - if "pad_token" in tokenizer_config and tokenizer_config[ - "pad_token"] != None: + if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["pad_token"][ - "content"]: + if key["content"] == tokenizer_config["pad_token"]["content"]: gguf_writer.add_pad_token_id(key["id"]) else: # If no tokenizer.json: Look for special tokens in config.json @@ -391,26 +355,13 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", hparams["multi_query_group_num"])) fout.write(struct.pack("i", hparams["ffn_hidden_size"])) fout.write(struct.pack("i", 0)) - fout.write(struct.pack("f", hparams.get("layernorm_epsilon", - 1e-6))) # rms norm eps + fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base - fout.write( - struct.pack( - "i", tokenizer.bos_token_id - if tokenizer.bos_token_id is not None else 1)) - fout.write( - struct.pack( - "i", tokenizer.eos_token_id - if tokenizer.eos_token_id is not None else 2)) - fout.write( - struct.pack( - "i", tokenizer.pad_token_id - if tokenizer.pad_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.sep_token_id - if tokenizer.sep_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) vocab = load_vocab_for_glm2(Path(dir_model)) counter = 0 @@ -499,26 +450,13 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", hparams["inner_hidden_size"])) - fout.write(struct.pack("f", 
hparams.get("rms_norm_eps", - 1e-6))) # rms norm eps + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base - fout.write( - struct.pack( - "i", tokenizer.bos_token_id - if tokenizer.bos_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.eos_token_id - if tokenizer.eos_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.pad_token_id - if tokenizer.pad_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.sep_token_id - if tokenizer.sep_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) vocab = load_vocab_for_glm1(Path(dir_model)) counter = 0 @@ -576,17 +514,10 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): def main(args_in: Optional[List[str]] = None) -> None: - parser = argparse.ArgumentParser( - description="Convert a model to a NE compatible file") - parser.add_argument("--outtype", - choices=["f32", "f16"], - help="output format (default: based on input)") - parser.add_argument("--outfile", - type=Path, - help="path to write to; default: based on input") - parser.add_argument("model", - type=Path, - help="directory containing model file") + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, default="NE", @@ -607,22 +538,16 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.outtype == "f16": ftype = 1 - tokenizer = AutoTokenizer.from_pretrained(dir_model, - trust_remote_code=True) - model = AutoModel.from_pretrained(dir_model, - low_cpu_mem_usage=True, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) if hasattr(model.config, "multi_query_attention"): if args.format == "GGUF": - chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, - ftype, hparams) + chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) else: - chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, - hparams) + chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) else: - chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, - hparams) + chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) if __name__ == '__main__': diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 95bb19e93..8b448c95d 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -31,8 +31,8 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, - Literal, Optional, Sequence, Tuple, 
TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, + TypeVar, Union) import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -70,13 +70,7 @@ class QuantizedDataType: DataType = Union[UnquantizedDataType, QuantizedDataType] -DATA_TYPE_TO_FTYPE: Dict[DataType, int] = { - DT_F32: 0, - DT_F16: 1, - DT_Q4_0: 2, - DT_Q4_1: 3, - DT_BOOL: 4 -} +DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {DT_F32: 0, DT_F16: 1, DT_Q4_0: 2, DT_Q4_1: 3, DT_BOOL: 4} FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \ {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} @@ -171,22 +165,18 @@ def guessed(model: 'LazyModel') -> 'Params': n_head=n_embd // 128, n_head_kv=n_embd // 128, f_norm_eps=1e-5, - n_layer=next( - i for i in itertools.count() - if f"model.layers.{i}.self_attn.q_proj.weight" not in model), + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model), ) @staticmethod - def loadHFTransformerJson(model: 'LazyModel', - config_path: Path) -> 'Params': + def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] n_embd = config["hidden_size"] n_layer = config["num_hidden_layers"] n_head = config["num_attention_heads"] - n_head_kv = config[ - "num_key_value_heads"] if "num_key_value_heads" in config else n_head + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head ffn_hidden_size = config["intermediate_size"] rms_norm_eps = config["rms_norm_eps"] rope_theta = config["rope_theta"] if "rope_theta" in config else 10000 @@ -207,8 +197,7 @@ def loadHFTransformerJson(model: 'LazyModel', # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, # "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def loadOriginalParamsJson(model: 'LazyModel', - config_path: Path) -> 'Params': + def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] if "vocab_size" in config else -1 @@ -240,8 +229,7 @@ def load(model: 'ModelPlus') -> 'Params': if hf_config_path.exists(): params = Params.loadHFTransformerJson(model.model, hf_config_path) elif orig_config_path.exists(): - params = Params.loadOriginalParamsJson(model.model, - orig_config_path) + params = Params.loadOriginalParamsJson(model.model, orig_config_path) elif model.format != 'none': params = Params.guessed(model.model) else: @@ -253,10 +241,8 @@ def load(model: 'ModelPlus') -> 'Params': class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, - fname_added_tokens: Optional[Path]) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor( - str(fname_tokenizer)) + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens)) @@ -266,15 +252,12 @@ def __init__(self, fname_tokenizer: Path, expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: - print( - f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}" - ) + print(f"Expected added token IDs to be sequential and start at 
{len(added_tokens)}; got {actual_ids}") added_tokens = {} items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len( - self.added_tokens_list) + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens @@ -293,8 +276,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: byte_value = int(piece[3:-1], 16) text = struct.pack("B", byte_value) else: - text = tokenizer.id_to_piece(i).replace("\u2581", - " ").encode("utf-8") + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") score: float = tokenizer.get_score(i) yield text, score @@ -331,12 +313,10 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: if n_head_kv is not None and n_head != n_head_kv: n_head //= n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, - *weights.shape[1:]).swapaxes(1, 2).reshape( - weights.shape)) + *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) -def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, - addends: Optional[NDArray], +def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray: # First reinterpret each row from a list of int32s containing 8 values each # to a list of uint8s containing 2 values each. @@ -344,8 +324,7 @@ def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, # Then split out the two values per int8 (which requires an actual # conversion because numpy doesn't natively support int4s). - qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], - dtype=np.uint8) + qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8) qvalues[:, 0::2] = qvalues_pack8 & 0xf qvalues[:, 1::2] = qvalues_pack8 >> 4 @@ -418,9 +397,7 @@ def permute(self, n_head: int, kv_head: int) -> 'UnquantizedTensor': return UnquantizedTensor(permute(self.ndarray, n_head, kv_head)) -def load_unquantized(lazy_tensor: 'LazyTensor', - expected_dtype: Any = None, - convert: bool = False) -> NDArray: +def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: tensor = lazy_tensor.load() assert isinstance(tensor, UnquantizedTensor) @@ -431,9 +408,7 @@ def load_unquantized(lazy_tensor: 'LazyTensor', if convert: tensor.ndarray = tensor.ndarray.astype(expected_dtype) else: - raise ValueError( - f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}' - ) + raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') return tensor.ndarray @@ -441,13 +416,10 @@ def load_unquantized(lazy_tensor: 'LazyTensor', class NEQuantizedTensor(Tensor): data_type: QuantizedDataType - def __init__(self, ndarray: NDArray, shape: List[int], - data_type: DataType) -> None: + def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: rows, columns = shape assert data_type in (DT_Q4_1, DT_Q4_0) # for now - assert isinstance( - data_type, - QuantizedDataType) # redundant, but mypy complains without this + assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this assert columns % data_type.groupsize == 0 words_in_block = 6 if data_type == DT_Q4_1 else 5 self.ndarray = ndarray.view(dtype=np.uint32).reshape( @@ 
-463,8 +435,7 @@ def astype(self, data_type: DataType) -> Tensor: addends = self.ndarray[:, :, 1].view(np.float32) else: addends = None - qweights = self.ndarray[:, :, -4:].reshape( - [self.shape[0], self.shape[1] // 8]) + qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) dq = dequantize_q4(qweights, scales, addends, g_idx=None) return UnquantizedTensor(dq).astype(data_type) @@ -473,8 +444,7 @@ def to_ne(self) -> 'NEQuantizedTensor': return self def permute(self, n_head: int, kv_head: int) -> 'NEQuantizedTensor': - return NEQuantizedTensor(permute(self.ndarray, n_head, kv_head), - self.shape, self.data_type) + return NEQuantizedTensor(permute(self.ndarray, n_head, kv_head), self.shape, self.data_type) NECompatibleTensor = Union[UnquantizedTensor, NEQuantizedTensor] @@ -500,9 +470,7 @@ def permute(self, n_head: int, kv_head: int) -> Tensor: class GPTQForLLaMaQuantizedTensor(Tensor): def __init__(self, model: 'LazyModel', namebase: str) -> None: qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) - scales = load_unquantized(model[f"{namebase}.scales"], - np.float32, - convert=True) + scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) bias = model.get(f"{namebase}.bias") if bias is not None: @@ -564,8 +532,7 @@ def astype(self, data_type: DataType) -> Tensor: assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False return self.regroup(data_type.groupsize) - dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), - self.scales, self.addends, self.g_idx) + dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) return UnquantizedTensor(dequantized).astype(data_type) def groupsize(self) -> int: @@ -573,8 +540,7 @@ def groupsize(self) -> int: assert self.shape[1] % self.scales.shape[1] == 0 return self.shape[1] // self.scales.shape[1] - def regroup(self, - new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': + def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': # Old versions of GPTQ-for-LLaMa shared scales and addends between all the # columns in a row. Newer versions share them between every set of N # columns in a row, where N is the `groupsize` parameter, usually 128. The @@ -585,12 +551,9 @@ def regroup(self, old_groupsize = self.groupsize() assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize ret = copy.copy(self) - ret.addends = self.addends.repeat(old_groupsize // new_groupsize, - axis=1) + ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) - ret.data_type = QuantizedDataType(groupsize=new_groupsize, - have_addends=True, - have_g_idx=False) + ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) return ret def permute(self, n_head: int, kv_head: int) -> Tensor: @@ -605,8 +568,7 @@ def to_ne(self) -> NEQuantizedTensor: # - weights (int4 * 32, 16 bytes) if self.groupsize() != 32: - raise Exception( - "should have been regrouped before converting to ne") + raise Exception("should have been regrouped before converting to ne") # Since the output format is mixed between integers and floats, we have # to hackily view the floats as int32s just so numpy will let us @@ -615,13 +577,10 @@ def to_ne(self) -> NEQuantizedTensor: scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] # Split into groups of 4 columns (i.e. 
32 columns of quantized data): - grouped = self.qweight.reshape( - [self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) + grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) # And concatenate: - grouped = np.concatenate([scales_view, addends_view, grouped], - axis=2, - casting='no') + grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') return NEQuantizedTensor(grouped, self.shape, DT_Q4_1) @@ -635,8 +594,7 @@ class LazyTensor: def load(self) -> Tensor: ret = self._load() - assert ret.data_type == self.data_type, (self.data_type, ret.data_type, - self.description) + assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> 'LazyTensor': @@ -645,23 +603,19 @@ def astype(self, data_type: DataType) -> 'LazyTensor': def load() -> Tensor: return self.load().astype(data_type) - return LazyTensor(load, self.shape, data_type, - f'convert({data_type}) {self.description}') + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: if data_type == self.data_type: return if isinstance(data_type, QuantizedDataType): if not isinstance(self.data_type, QuantizedDataType): - raise Exception( - f"Can't turn an unquantized tensor into a quantized type ({data_type})" - ) + raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") if self.data_type.have_g_idx: sys.stderr.write( "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively\ supported by NE. For now you can still convert this model by passing `--outtype f16` to \ - dequantize, but that will result in a much larger output file for no quality benefit.\n" - ) + dequantize, but that will result in a much larger output file for no quality benefit.\n") sys.exit(1) assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends @@ -674,8 +628,7 @@ class ModelPlus: model: LazyModel paths: List[Path] # Where this was read from. format: Literal['ne', 'torch', 'safetensors'] - vocab: Optional[ - Vocab] # For NE models (which have vocab built in), the vocab. + vocab: Optional[Vocab] # For NE models (which have vocab built in), the vocab. 
def merge_sharded(models: List[LazyModel]) -> LazyModel: @@ -701,18 +654,15 @@ def convert(name: str) -> LazyTensor: # split by rows axis = 0 concatenated_shape = list(lazy_tensors[0].shape) - concatenated_shape[axis] = sum(tensor.shape[axis] - for tensor in lazy_tensors) + concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) def load() -> UnquantizedTensor: ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] concatenated: NDArray = np.concatenate(ndarrays, axis=axis) return UnquantizedTensor(concatenated) - description = 'concatenated[[' + '] | ['.join( - lt.description for lt in lazy_tensors) + ']]' - return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, - description) + description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' + return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) return {name: convert(name) for name in names} @@ -740,18 +690,15 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: return ModelPlus(model, paths, format, vocab) -def permute_lazy(lazy_tensor: LazyTensor, n_head: int, - n_head_kv: int) -> LazyTensor: +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute(n_head, n_head_kv) - return LazyTensor( - load, lazy_tensor.shape, lazy_tensor.data_type, - f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, + f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel, - params: Params) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] @@ -760,28 +707,19 @@ def convert_transformers_to_orig(model: LazyModel, for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy( - model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, - params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy( - model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, - params.n_head_kv) - out[f"layers.{i}.attention.wv.weight"] = model[ - f"model.layers.{i}.self_attn.v_proj.weight"] - out[f"layers.{i}.attention.wo.weight"] = model[ - f"model.layers.{i}.self_attn.o_proj.weight"] - - out[f"layers.{i}.feed_forward.w1.weight"] = model[ - f"model.layers.{i}.mlp.gate_proj.weight"] - out[f"layers.{i}.feed_forward.w2.weight"] = model[ - f"model.layers.{i}.mlp.down_proj.weight"] - out[f"layers.{i}.feed_forward.w3.weight"] = model[ - f"model.layers.{i}.mlp.up_proj.weight"] - - out[f"layers.{i}.attention_norm.weight"] = model[ - f"model.layers.{i}.input_layernorm.weight"] - out[f"layers.{i}.ffn_norm.weight"] = model[ - f"model.layers.{i}.post_attention_layernorm.weight"] + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], + params.n_head, params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], + params.n_head, params.n_head_kv) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] + + 
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] + out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] + out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] + + out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] + out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] return out @@ -803,20 +741,16 @@ def convert(name: str) -> Tuple[str, LazyTensor]: # GPTQForLLaMaQuantizedTensor (which is executed when the modelis # actually loaded). lazy_scales = model[f"{namebase}.scales"] - scales_width = 1 if lazy_scales.shape[ - 1] == 1 else lazy_scales.shape[0] + scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0] assert real_shape[1] % scales_width == 0 groupsize = real_shape[1] // scales_width have_g_idx = f"{namebase}.g_idx" in model - data_type = QuantizedDataType(groupsize=groupsize, - have_addends=True, - have_g_idx=have_g_idx) + data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx) def load() -> Tensor: return GPTQForLLaMaQuantizedTensor(model, namebase) - return (orig_name, - LazyTensor(load, real_shape, data_type, '[quantized]')) + return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]')) else: return (name, model[name]) @@ -844,8 +778,7 @@ class LazyStorage: class LazyUnpickler(pickle.Unpickler): - def __init__(self, fp: IO[bytes], data_base_path: str, - zip_file: zipfile.ZipFile): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): super().__init__(fp) self.data_base_path = data_base_path self.zip_file = zip_file @@ -887,12 +820,10 @@ def lazy_rebuild_tensor_v2( def load() -> UnquantizedTensor: elm_count = stride[0] * size[0] - return UnquantizedTensor( - storage.load(storage_offset, elm_count).reshape(size)) + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) description = f'pickled storage_offset={storage_offset} in {storage.description}' - return LazyTensor(load, list(size), storage.kind.data_type, - description) + return LazyTensor(load, list(size), storage.kind.data_type, description) # @staticmethod def rebuild_from_type_v2(func, new_type, args, state): @@ -920,9 +851,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] assert len(pickle_paths) == 1, pickle_paths pickle_fp = zf.open(pickle_paths[0], 'r') - unpickler = LazyUnpickler(pickle_fp, - data_base_path=pickle_paths[0][:-4], - zip_file=zf) + unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], zip_file=zf) model = unpickler.load() as_dict = dict(model.items()) return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) @@ -954,20 +883,13 @@ def convert(info: Dict[str, Any]) -> LazyTensor: buf = byte_buf[begin:end] def load() -> UnquantizedTensor: - return UnquantizedTensor( - np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'safetensors begin={begin} end={end} type={data_type} path={path}' return LazyTensor(load, shape, data_type, description) - model = { - name: convert(info) - for (name, info) in header.items() if name != '__metadata__' - } - return ModelPlus(model=model, - paths=[path], - format='safetensors', - vocab=None) + model = {name: convert(info) for 
(name, info) in header.items() if name != '__metadata__'} + return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) def must_read(fp: IO[bytes], length: int) -> bytes: @@ -985,8 +907,7 @@ def lazy_load_ne_file(fp: io.BufferedReader, path: Path) -> ModelPlus: else: assert magic == b'ne' version = None - n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack( - '<7i', must_read(fp, 28)) + n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28)) tokens: List[Tuple[bytes, float]] = [] for i in range(n_vocab): @@ -1016,12 +937,10 @@ def lazy_load_ne_file(fp: io.BufferedReader, path: Path) -> ModelPlus: mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) fp.raw.seek(off) # needed on Windows - def read_tensor( - ) -> None: # this is a function so that variables captured in `load` don't change + def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) assert 0 <= shape_len <= 3 - shape: List[int] = list( - struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) + shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) shape = shape[::-1] name = must_read(fp, name_len).decode('utf-8') data_type = FTYPE_TO_DATA_TYPE[ftype] @@ -1047,8 +966,7 @@ def load() -> Tensor: ndarray = np.frombuffer(buf, dtype=np.uint32) return NEQuantizedTensor(ndarray, shape, data_type) else: - return UnquantizedTensor( - np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'ne offset={offset} type={data_type} path={path}' model[name] = LazyTensor(load, shape, data_type, description) @@ -1082,8 +1000,7 @@ def lazy_load_file(path: Path) -> ModelPlus: Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], - concurrency: int) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one @@ -1105,9 +1022,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: # NEVocab comes from the same file as the model so shouldn't mismatch: assert isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: - print( - "Ignoring added_tokens.json since model matches vocab size without it." 
- ) + print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size_base return @@ -1159,12 +1074,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("i", 0)) self.fout.write(struct.pack("i", 0)) - def write_tensor_header(self, name: str, shape: Sequence[int], - data_type: DataType) -> None: + def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: sname = name.encode('utf-8') - self.fout.write( - struct.pack("iii", len(shape), len(sname), - DATA_TYPE_TO_FTYPE[data_type])) + self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) self.fout.write(sname) self.fout.seek((self.fout.tell() + 31) & -32) @@ -1190,8 +1102,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, - vocab: Vocab, file_type: NEFileType) -> None: + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) of.write_file_header(params, file_type) @@ -1203,13 +1114,11 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray: return lazy_tensor.load().to_ne().ndarray ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) - for i, ((name, lazy_tensor), - ndarray) in enumerate(zip(model.items(), ndarrays)): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ type {lazy_tensor.data_type}") of.write_tensor_header( name, @@ -1251,8 +1160,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.gguf_writer.add_int32('pad_token_id', 0) self.gguf_writer.add_int32('sep_token_id', 0) - def write_tensor_header_gguf(self, name: str, shape: Sequence[int], - data_type: DataType, data) -> None: + def write_tensor_header_gguf(self, name: str, shape: Sequence[int], data_type: DataType, data) -> None: # sname = name.encode('utf-8') # self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) # self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) @@ -1308,8 +1216,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, - vocab: Vocab, file_type: NEFileType) -> None: + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: check_vocab_size(params, vocab) of = OutputFile_GGUF(fname_out) of.write_file_header(params, file_type) @@ -1321,42 +1228,32 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray: return lazy_tensor.load().to_ne().ndarray ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) - for i, ((name, lazy_tensor), - ndarray) in enumerate(zip(model.items(), ndarrays)): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} 
| size {size:16} | \ + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ type {lazy_tensor.data_type}") - of.write_tensor_header_gguf(name, lazy_tensor.shape, - lazy_tensor.data_type, ndarray) + of.write_tensor_header_gguf(name, lazy_tensor.shape, lazy_tensor.data_type, ndarray) of.end() of.fout.close() -def pick_output_type(model: LazyModel, - output_type_str: Optional[str]) -> NEFileType: +def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> NEFileType: wq_type = model["layers.0.attention.wq.weight"].data_type - if output_type_str == "f32" or (output_type_str is None - and wq_type in (DT_F32, DT_BF16)): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return NEFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None - and wq_type == DT_F16): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return NEFileType.MostlyF16 - if output_type_str == "q4_1" or (output_type_str is None and isinstance( - wq_type, QuantizedDataType) and wq_type.have_addends): + if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) + and wq_type.have_addends): if isinstance(model["output.weight"].data_type, QuantizedDataType): return NEFileType.MostlyQ4_1 else: return NEFileType.PerLayerIsQ4_1 - if output_type_str == "q4_0" or (output_type_str is None and isinstance( - wq_type, QuantizedDataType)): + if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)): return NEFileType.MostlyQ4_0 - name_to_type = { - name: lazy_tensor.data_type - for (name, lazy_tensor) in model.items() - } + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} raise Exception(f"Unexpected combination of types: {name_to_type}") @@ -1370,12 +1267,8 @@ def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: return model -def convert_to_output_type(model: LazyModel, - output_type: NEFileType) -> LazyModel: - return { - name: tensor.astype(output_type.type_for_tensor(name, tensor)) - for (name, tensor) in model.items() - } +def convert_to_output_type(model: LazyModel, output_type: NEFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} def nth_multifile_path(path: Path, n: int) -> Optional[Path]: @@ -1425,10 +1318,7 @@ def load_some_model(path: Path) -> ModelPlus: files = list(path.glob("model-00001-of-*.safetensors")) if not files: # Try the PyTorch patterns too, with lower priority - globs = [ - "consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", - "pytorch_model.bin" - ] + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: # Try NE too, but with lower priority, since if both a non-NE @@ -1438,9 +1328,7 @@ def load_some_model(path: Path) -> ModelPlus: if not files: raise Exception(f"Can't find model in directory {path}") if len(files) > 1: - raise Exception( - f"Found multiple models in {path}, not sure which to pick: {files}" - ) + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") path = files[0] paths = find_multifile_paths(path) @@ -1475,8 +1363,7 @@ def load_vocab(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file 
{path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) def default_outfile(model_paths: List[Path], params: Params) -> Path: @@ -1501,41 +1388,26 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") for name, lazy_tensor in model_plus.model.items(): - print( - f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}" - ) + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") def main(args_in: Optional[List[str]] = None) -> None: - parser = argparse.ArgumentParser( - description="Convert a LLaMa model to a NE compatible file") - parser.add_argument("--dump", + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a NE compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", - help="don't convert, just show what's in the model") - parser.add_argument( - "--dump-single", - action="store_true", - help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", - action="store_true", - help="extract only the vocab") + help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") - parser.add_argument( - "--vocab-dir", - type=Path, - help="directory containing tokenizer.model, if separate from model file" - ) - parser.add_argument("--outfile", + parser.add_argument("--vocab-dir", + type=Path, + help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, - help="path to write to; default: based on input") - parser.add_argument( - "model", - type=Path, - help= - "directory containing model file, or model file itself (*.pth, *.pt, *.bin)" - ) + help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("--format", type=str, default="NE", @@ -1562,8 +1434,7 @@ def main(args_in: Optional[List[str]] = None) -> None: if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab else: - vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[ - 0].parent + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) model = model_plus.model @@ -1574,8 +1445,7 @@ def main(args_in: Optional[List[str]] = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, params) if args.format == "GGUF": - OutputFile_GGUF.write_all(outfile, params, model, vocab, - output_type) + OutputFile_GGUF.write_all(outfile, params, model, vocab, output_type) else: OutputFile.write_all(outfile, params, model, vocab, output_type) diff --git a/requirements.txt b/requirements.txt index f82674779..e30a99c1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ datasets transformers_stream_generator tiktoken py-cpuinfo -cmake \ No newline at end of file +gguf +cmake