From 021b3bda0ef50119829490db6b98ae6f7451fe8a Mon Sep 17 00:00:00 2001 From: Zhenzhong1 Date: Tue, 9 Jan 2024 18:27:41 -0800 Subject: [PATCH] pylint & add gguf to requirements.txt --- neural_speed/convert/convert_chatglm.py | 179 ++++-------- neural_speed/convert/convert_llama.py | 344 ++++++++---------------- requirements.txt | 3 +- 3 files changed, 161 insertions(+), 365 deletions(-) diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 57de1d3ab..4ac7c771b 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -17,8 +17,8 @@ import numpy as np from pathlib import Path import argparse -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, - Literal, Optional, Sequence, Tuple, TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, + TypeVar, Union) from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -36,10 +36,9 @@ def bytes_to_unicode(): And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord("!"), - ord("~") + 1)) + list(range( - ord("¡"), - ord("¬") + 1)) + list(range(ord("®"), - ord("ÿ") + 1)) + ord("~") + 1)) + list(range(ord("¡"), + ord("¬") + 1)) + list(range(ord("®"), + ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2**8): @@ -54,10 +53,8 @@ def bytes_to_unicode(): class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, - fname_added_tokens: Optional[Path]) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor( - str(fname_tokenizer)) + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens)) @@ -68,13 +65,11 @@ def __init__(self, fname_tokenizer: Path, actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: raise Exception( - f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}" - ) + f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len( - self.added_tokens_list) + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens @@ -93,8 +88,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: byte_value = int(piece[3:-1], 16) text = struct.pack("B", byte_value) else: - text = tokenizer.id_to_piece(i).replace("\u2581", - " ").encode("utf-8") + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") score: float = tokenizer.get_score(i) yield text, score @@ -130,8 +124,7 @@ def load_vocab_for_glm1(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) def load_vocab_for_glm2(path: 
Path) -> SentencePieceVocab: @@ -152,12 +145,10 @@ def load_vocab_for_glm2(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, - hparams): +def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): print("ChatGLM-2.gguf converting: ") list_vars = model.state_dict() for name in list_vars.keys(): @@ -188,23 +179,14 @@ def chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, gguf_writer.add_uint32('word_embed_proj_dim', 0) gguf_writer.add_uint32('do_layer_norm_before', 0) - gguf_writer.add_uint32('multi_query_group_num', - hparams["multi_query_group_num"]) + gguf_writer.add_uint32('multi_query_group_num', hparams["multi_query_group_num"]) gguf_writer.add_uint32('ffn_hidden_size', hparams["ffn_hidden_size"]) gguf_writer.add_uint32('inner_hidden_size', 0) - gguf_writer.add_int32( - 'bos_token_id', - tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1) - gguf_writer.add_int32( - 'eos_token_id', - tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1) - gguf_writer.add_int32( - 'pad_token_id', - tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1) - gguf_writer.add_int32( - 'sep_token_id', - tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1) + gguf_writer.add_int32('bos_token_id', tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1) + gguf_writer.add_int32('eos_token_id', tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1) + gguf_writer.add_int32('pad_token_id', tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1) + gguf_writer.add_int32('sep_token_id', tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1) def write_vocab_gguf(dir_model): print("gguf: get tokenizer metadata") @@ -215,9 +197,7 @@ def write_vocab_gguf(dir_model): if Path(dir_model + "/tokenizer.model").is_file(): # vocab type sentencepiece - print( - "gguf: get sentencepiece tokenizer vocab, scores and token types" - ) + print("gguf: get sentencepiece tokenizer vocab, scores and token types") vocab = load_vocab_for_glm2(Path(dir_model)) @@ -227,9 +207,7 @@ def write_vocab_gguf(dir_model): scores.append(score) if Path(dir_model + "/added_tokens.json").is_file(): - with open(dir_model + "/added_tokens.json", - "r", - encoding="utf-8") as f: + with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: addtokens_json = json.load(f) print("gguf: get added tokens") @@ -248,51 +226,37 @@ def write_vocab_gguf(dir_model): if Path(dir_model + "/tokenizer.json").is_file(): # Look for special tokens in tokenizer.json if it exists - with open(dir_model + "/tokenizer.json", "r", - encoding="utf-8") as f: + with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: tokenizer = json.load(f) - if "added_tokens" in tokenizer and Path( - dir_model + "/tokenizer_config.json").is_file(): + if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file(): - with open(dir_model + "/tokenizer_config.json", - "r", - encoding="utf-8") as f: + with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: tokenizer_config = json.load(f) - if "bos_token" in 
tokenizer_config and tokenizer_config[ - "bos_token"] != None: + if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["bos_token"][ - "content"]: + if key["content"] == tokenizer_config["bos_token"]["content"]: gguf_writer.add_bos_token_id(key["id"]) - if "eos_token" in tokenizer_config and tokenizer_config[ - "eos_token"] != None: + if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["eos_token"][ - "content"]: + if key["content"] == tokenizer_config["eos_token"]["content"]: gguf_writer.add_eos_token_id(key["id"]) - if "unk_token" in tokenizer_config and tokenizer_config[ - "unk_token"] != None: + if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["unk_token"][ - "content"]: + if key["content"] == tokenizer_config["unk_token"]["content"]: gguf_writer.add_unk_token_id(key["id"]) - if "sep_token" in tokenizer_config and tokenizer_config[ - "sep_token"] != None: + if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["sep_token"][ - "content"]: + if key["content"] == tokenizer_config["sep_token"]["content"]: gguf_writer.add_sep_token_id(key["id"]) - if "pad_token" in tokenizer_config and tokenizer_config[ - "pad_token"] != None: + if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None: for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["pad_token"][ - "content"]: + if key["content"] == tokenizer_config["pad_token"]["content"]: gguf_writer.add_pad_token_id(key["id"]) else: # If no tokenizer.json: Look for special tokens in config.json @@ -391,26 +355,13 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", hparams["multi_query_group_num"])) fout.write(struct.pack("i", hparams["ffn_hidden_size"])) fout.write(struct.pack("i", 0)) - fout.write(struct.pack("f", hparams.get("layernorm_epsilon", - 1e-6))) # rms norm eps + fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base - fout.write( - struct.pack( - "i", tokenizer.bos_token_id - if tokenizer.bos_token_id is not None else 1)) - fout.write( - struct.pack( - "i", tokenizer.eos_token_id - if tokenizer.eos_token_id is not None else 2)) - fout.write( - struct.pack( - "i", tokenizer.pad_token_id - if tokenizer.pad_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.sep_token_id - if tokenizer.sep_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) vocab = load_vocab_for_glm2(Path(dir_model)) counter = 0 @@ -499,26 +450,13 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", hparams["inner_hidden_size"])) - fout.write(struct.pack("f", 
hparams.get("rms_norm_eps", - 1e-6))) # rms norm eps + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base - fout.write( - struct.pack( - "i", tokenizer.bos_token_id - if tokenizer.bos_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.eos_token_id - if tokenizer.eos_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.pad_token_id - if tokenizer.pad_token_id is not None else -1)) - fout.write( - struct.pack( - "i", tokenizer.sep_token_id - if tokenizer.sep_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) vocab = load_vocab_for_glm1(Path(dir_model)) counter = 0 @@ -576,17 +514,10 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): def main(args_in: Optional[List[str]] = None) -> None: - parser = argparse.ArgumentParser( - description="Convert a model to a NE compatible file") - parser.add_argument("--outtype", - choices=["f32", "f16"], - help="output format (default: based on input)") - parser.add_argument("--outfile", - type=Path, - help="path to write to; default: based on input") - parser.add_argument("model", - type=Path, - help="directory containing model file") + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, default="NE", @@ -607,22 +538,16 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.outtype == "f16": ftype = 1 - tokenizer = AutoTokenizer.from_pretrained(dir_model, - trust_remote_code=True) - model = AutoModel.from_pretrained(dir_model, - low_cpu_mem_usage=True, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) if hasattr(model.config, "multi_query_attention"): if args.format == "GGUF": - chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, - ftype, hparams) + chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) else: - chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, - hparams) + chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) else: - chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, - hparams) + chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) if __name__ == '__main__': diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 95bb19e93..8b448c95d 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -31,8 +31,8 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, - Literal, Optional, Sequence, Tuple, 
TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, + TypeVar, Union) import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -70,13 +70,7 @@ class QuantizedDataType: DataType = Union[UnquantizedDataType, QuantizedDataType] -DATA_TYPE_TO_FTYPE: Dict[DataType, int] = { - DT_F32: 0, - DT_F16: 1, - DT_Q4_0: 2, - DT_Q4_1: 3, - DT_BOOL: 4 -} +DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {DT_F32: 0, DT_F16: 1, DT_Q4_0: 2, DT_Q4_1: 3, DT_BOOL: 4} FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \ {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} @@ -171,22 +165,18 @@ def guessed(model: 'LazyModel') -> 'Params': n_head=n_embd // 128, n_head_kv=n_embd // 128, f_norm_eps=1e-5, - n_layer=next( - i for i in itertools.count() - if f"model.layers.{i}.self_attn.q_proj.weight" not in model), + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model), ) @staticmethod - def loadHFTransformerJson(model: 'LazyModel', - config_path: Path) -> 'Params': + def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] n_embd = config["hidden_size"] n_layer = config["num_hidden_layers"] n_head = config["num_attention_heads"] - n_head_kv = config[ - "num_key_value_heads"] if "num_key_value_heads" in config else n_head + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head ffn_hidden_size = config["intermediate_size"] rms_norm_eps = config["rms_norm_eps"] rope_theta = config["rope_theta"] if "rope_theta" in config else 10000 @@ -207,8 +197,7 @@ def loadHFTransformerJson(model: 'LazyModel', # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, # "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def loadOriginalParamsJson(model: 'LazyModel', - config_path: Path) -> 'Params': + def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': config = json.load(open(config_path)) n_vocab = config["vocab_size"] if "vocab_size" in config else -1 @@ -240,8 +229,7 @@ def load(model: 'ModelPlus') -> 'Params': if hf_config_path.exists(): params = Params.loadHFTransformerJson(model.model, hf_config_path) elif orig_config_path.exists(): - params = Params.loadOriginalParamsJson(model.model, - orig_config_path) + params = Params.loadOriginalParamsJson(model.model, orig_config_path) elif model.format != 'none': params = Params.guessed(model.model) else: @@ -253,10 +241,8 @@ def load(model: 'ModelPlus') -> 'Params': class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, - fname_added_tokens: Optional[Path]) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor( - str(fname_tokenizer)) + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens)) @@ -266,15 +252,12 @@ def __init__(self, fname_tokenizer: Path, expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: - print( - f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}" - ) + print(f"Expected added token IDs to be sequential and start at 
{len(added_tokens)}; got {actual_ids}") added_tokens = {} items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len( - self.added_tokens_list) + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens @@ -293,8 +276,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: byte_value = int(piece[3:-1], 16) text = struct.pack("B", byte_value) else: - text = tokenizer.id_to_piece(i).replace("\u2581", - " ").encode("utf-8") + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") score: float = tokenizer.get_score(i) yield text, score @@ -331,12 +313,10 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: if n_head_kv is not None and n_head != n_head_kv: n_head //= n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, - *weights.shape[1:]).swapaxes(1, 2).reshape( - weights.shape)) + *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) -def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, - addends: Optional[NDArray], +def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray: # First reinterpret each row from a list of int32s containing 8 values each # to a list of uint8s containing 2 values each. @@ -344,8 +324,7 @@ def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, # Then split out the two values per int8 (which requires an actual # conversion because numpy doesn't natively support int4s). - qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], - dtype=np.uint8) + qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8) qvalues[:, 0::2] = qvalues_pack8 & 0xf qvalues[:, 1::2] = qvalues_pack8 >> 4 @@ -418,9 +397,7 @@ def permute(self, n_head: int, kv_head: int) -> 'UnquantizedTensor': return UnquantizedTensor(permute(self.ndarray, n_head, kv_head)) -def load_unquantized(lazy_tensor: 'LazyTensor', - expected_dtype: Any = None, - convert: bool = False) -> NDArray: +def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: tensor = lazy_tensor.load() assert isinstance(tensor, UnquantizedTensor) @@ -431,9 +408,7 @@ def load_unquantized(lazy_tensor: 'LazyTensor', if convert: tensor.ndarray = tensor.ndarray.astype(expected_dtype) else: - raise ValueError( - f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}' - ) + raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') return tensor.ndarray @@ -441,13 +416,10 @@ def load_unquantized(lazy_tensor: 'LazyTensor', class NEQuantizedTensor(Tensor): data_type: QuantizedDataType - def __init__(self, ndarray: NDArray, shape: List[int], - data_type: DataType) -> None: + def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: rows, columns = shape assert data_type in (DT_Q4_1, DT_Q4_0) # for now - assert isinstance( - data_type, - QuantizedDataType) # redundant, but mypy complains without this + assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this assert columns % data_type.groupsize == 0 words_in_block = 6 if data_type == DT_Q4_1 else 5 self.ndarray = ndarray.view(dtype=np.uint32).reshape( @@ 
-463,8 +435,7 @@ def astype(self, data_type: DataType) -> Tensor: addends = self.ndarray[:, :, 1].view(np.float32) else: addends = None - qweights = self.ndarray[:, :, -4:].reshape( - [self.shape[0], self.shape[1] // 8]) + qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) dq = dequantize_q4(qweights, scales, addends, g_idx=None) return UnquantizedTensor(dq).astype(data_type) @@ -473,8 +444,7 @@ def to_ne(self) -> 'NEQuantizedTensor': return self def permute(self, n_head: int, kv_head: int) -> 'NEQuantizedTensor': - return NEQuantizedTensor(permute(self.ndarray, n_head, kv_head), - self.shape, self.data_type) + return NEQuantizedTensor(permute(self.ndarray, n_head, kv_head), self.shape, self.data_type) NECompatibleTensor = Union[UnquantizedTensor, NEQuantizedTensor] @@ -500,9 +470,7 @@ def permute(self, n_head: int, kv_head: int) -> Tensor: class GPTQForLLaMaQuantizedTensor(Tensor): def __init__(self, model: 'LazyModel', namebase: str) -> None: qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) - scales = load_unquantized(model[f"{namebase}.scales"], - np.float32, - convert=True) + scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) bias = model.get(f"{namebase}.bias") if bias is not None: @@ -564,8 +532,7 @@ def astype(self, data_type: DataType) -> Tensor: assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False return self.regroup(data_type.groupsize) - dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), - self.scales, self.addends, self.g_idx) + dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) return UnquantizedTensor(dequantized).astype(data_type) def groupsize(self) -> int: @@ -573,8 +540,7 @@ def groupsize(self) -> int: assert self.shape[1] % self.scales.shape[1] == 0 return self.shape[1] // self.scales.shape[1] - def regroup(self, - new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': + def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': # Old versions of GPTQ-for-LLaMa shared scales and addends between all the # columns in a row. Newer versions share them between every set of N # columns in a row, where N is the `groupsize` parameter, usually 128. The @@ -585,12 +551,9 @@ def regroup(self, old_groupsize = self.groupsize() assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize ret = copy.copy(self) - ret.addends = self.addends.repeat(old_groupsize // new_groupsize, - axis=1) + ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) - ret.data_type = QuantizedDataType(groupsize=new_groupsize, - have_addends=True, - have_g_idx=False) + ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) return ret def permute(self, n_head: int, kv_head: int) -> Tensor: @@ -605,8 +568,7 @@ def to_ne(self) -> NEQuantizedTensor: # - weights (int4 * 32, 16 bytes) if self.groupsize() != 32: - raise Exception( - "should have been regrouped before converting to ne") + raise Exception("should have been regrouped before converting to ne") # Since the output format is mixed between integers and floats, we have # to hackily view the floats as int32s just so numpy will let us @@ -615,13 +577,10 @@ def to_ne(self) -> NEQuantizedTensor: scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] # Split into groups of 4 columns (i.e. 
32 columns of quantized data): - grouped = self.qweight.reshape( - [self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) + grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) # And concatenate: - grouped = np.concatenate([scales_view, addends_view, grouped], - axis=2, - casting='no') + grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') return NEQuantizedTensor(grouped, self.shape, DT_Q4_1) @@ -635,8 +594,7 @@ class LazyTensor: def load(self) -> Tensor: ret = self._load() - assert ret.data_type == self.data_type, (self.data_type, ret.data_type, - self.description) + assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> 'LazyTensor': @@ -645,23 +603,19 @@ def astype(self, data_type: DataType) -> 'LazyTensor': def load() -> Tensor: return self.load().astype(data_type) - return LazyTensor(load, self.shape, data_type, - f'convert({data_type}) {self.description}') + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: if data_type == self.data_type: return if isinstance(data_type, QuantizedDataType): if not isinstance(self.data_type, QuantizedDataType): - raise Exception( - f"Can't turn an unquantized tensor into a quantized type ({data_type})" - ) + raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") if self.data_type.have_g_idx: sys.stderr.write( "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively\ supported by NE. For now you can still convert this model by passing `--outtype f16` to \ - dequantize, but that will result in a much larger output file for no quality benefit.\n" - ) + dequantize, but that will result in a much larger output file for no quality benefit.\n") sys.exit(1) assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends @@ -674,8 +628,7 @@ class ModelPlus: model: LazyModel paths: List[Path] # Where this was read from. format: Literal['ne', 'torch', 'safetensors'] - vocab: Optional[ - Vocab] # For NE models (which have vocab built in), the vocab. + vocab: Optional[Vocab] # For NE models (which have vocab built in), the vocab. 
def merge_sharded(models: List[LazyModel]) -> LazyModel: @@ -701,18 +654,15 @@ def convert(name: str) -> LazyTensor: # split by rows axis = 0 concatenated_shape = list(lazy_tensors[0].shape) - concatenated_shape[axis] = sum(tensor.shape[axis] - for tensor in lazy_tensors) + concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) def load() -> UnquantizedTensor: ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] concatenated: NDArray = np.concatenate(ndarrays, axis=axis) return UnquantizedTensor(concatenated) - description = 'concatenated[[' + '] | ['.join( - lt.description for lt in lazy_tensors) + ']]' - return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, - description) + description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' + return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) return {name: convert(name) for name in names} @@ -740,18 +690,15 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: return ModelPlus(model, paths, format, vocab) -def permute_lazy(lazy_tensor: LazyTensor, n_head: int, - n_head_kv: int) -> LazyTensor: +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute(n_head, n_head_kv) - return LazyTensor( - load, lazy_tensor.shape, lazy_tensor.data_type, - f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, + f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel, - params: Params) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] @@ -760,28 +707,19 @@ def convert_transformers_to_orig(model: LazyModel, for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy( - model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, - params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy( - model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, - params.n_head_kv) - out[f"layers.{i}.attention.wv.weight"] = model[ - f"model.layers.{i}.self_attn.v_proj.weight"] - out[f"layers.{i}.attention.wo.weight"] = model[ - f"model.layers.{i}.self_attn.o_proj.weight"] - - out[f"layers.{i}.feed_forward.w1.weight"] = model[ - f"model.layers.{i}.mlp.gate_proj.weight"] - out[f"layers.{i}.feed_forward.w2.weight"] = model[ - f"model.layers.{i}.mlp.down_proj.weight"] - out[f"layers.{i}.feed_forward.w3.weight"] = model[ - f"model.layers.{i}.mlp.up_proj.weight"] - - out[f"layers.{i}.attention_norm.weight"] = model[ - f"model.layers.{i}.input_layernorm.weight"] - out[f"layers.{i}.ffn_norm.weight"] = model[ - f"model.layers.{i}.post_attention_layernorm.weight"] + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], + params.n_head, params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], + params.n_head, params.n_head_kv) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] + + 
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] + out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] + out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] + + out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] + out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] return out @@ -803,20 +741,16 @@ def convert(name: str) -> Tuple[str, LazyTensor]: # GPTQForLLaMaQuantizedTensor (which is executed when the modelis # actually loaded). lazy_scales = model[f"{namebase}.scales"] - scales_width = 1 if lazy_scales.shape[ - 1] == 1 else lazy_scales.shape[0] + scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0] assert real_shape[1] % scales_width == 0 groupsize = real_shape[1] // scales_width have_g_idx = f"{namebase}.g_idx" in model - data_type = QuantizedDataType(groupsize=groupsize, - have_addends=True, - have_g_idx=have_g_idx) + data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx) def load() -> Tensor: return GPTQForLLaMaQuantizedTensor(model, namebase) - return (orig_name, - LazyTensor(load, real_shape, data_type, '[quantized]')) + return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]')) else: return (name, model[name]) @@ -844,8 +778,7 @@ class LazyStorage: class LazyUnpickler(pickle.Unpickler): - def __init__(self, fp: IO[bytes], data_base_path: str, - zip_file: zipfile.ZipFile): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): super().__init__(fp) self.data_base_path = data_base_path self.zip_file = zip_file @@ -887,12 +820,10 @@ def lazy_rebuild_tensor_v2( def load() -> UnquantizedTensor: elm_count = stride[0] * size[0] - return UnquantizedTensor( - storage.load(storage_offset, elm_count).reshape(size)) + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) description = f'pickled storage_offset={storage_offset} in {storage.description}' - return LazyTensor(load, list(size), storage.kind.data_type, - description) + return LazyTensor(load, list(size), storage.kind.data_type, description) # @staticmethod def rebuild_from_type_v2(func, new_type, args, state): @@ -920,9 +851,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] assert len(pickle_paths) == 1, pickle_paths pickle_fp = zf.open(pickle_paths[0], 'r') - unpickler = LazyUnpickler(pickle_fp, - data_base_path=pickle_paths[0][:-4], - zip_file=zf) + unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], zip_file=zf) model = unpickler.load() as_dict = dict(model.items()) return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) @@ -954,20 +883,13 @@ def convert(info: Dict[str, Any]) -> LazyTensor: buf = byte_buf[begin:end] def load() -> UnquantizedTensor: - return UnquantizedTensor( - np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'safetensors begin={begin} end={end} type={data_type} path={path}' return LazyTensor(load, shape, data_type, description) - model = { - name: convert(info) - for (name, info) in header.items() if name != '__metadata__' - } - return ModelPlus(model=model, - paths=[path], - format='safetensors', - vocab=None) + model = {name: convert(info) for 
(name, info) in header.items() if name != '__metadata__'} + return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) def must_read(fp: IO[bytes], length: int) -> bytes: @@ -985,8 +907,7 @@ def lazy_load_ne_file(fp: io.BufferedReader, path: Path) -> ModelPlus: else: assert magic == b'ne' version = None - n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack( - '<7i', must_read(fp, 28)) + n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28)) tokens: List[Tuple[bytes, float]] = [] for i in range(n_vocab): @@ -1016,12 +937,10 @@ def lazy_load_ne_file(fp: io.BufferedReader, path: Path) -> ModelPlus: mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) fp.raw.seek(off) # needed on Windows - def read_tensor( - ) -> None: # this is a function so that variables captured in `load` don't change + def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) assert 0 <= shape_len <= 3 - shape: List[int] = list( - struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) + shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) shape = shape[::-1] name = must_read(fp, name_len).decode('utf-8') data_type = FTYPE_TO_DATA_TYPE[ftype] @@ -1047,8 +966,7 @@ def load() -> Tensor: ndarray = np.frombuffer(buf, dtype=np.uint32) return NEQuantizedTensor(ndarray, shape, data_type) else: - return UnquantizedTensor( - np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) description = f'ne offset={offset} type={data_type} path={path}' model[name] = LazyTensor(load, shape, data_type, description) @@ -1082,8 +1000,7 @@ def lazy_load_file(path: Path) -> ModelPlus: Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], - concurrency: int) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one @@ -1105,9 +1022,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: # NEVocab comes from the same file as the model so shouldn't mismatch: assert isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: - print( - "Ignoring added_tokens.json since model matches vocab size without it." 
- ) + print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size_base return @@ -1159,12 +1074,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("i", 0)) self.fout.write(struct.pack("i", 0)) - def write_tensor_header(self, name: str, shape: Sequence[int], - data_type: DataType) -> None: + def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: sname = name.encode('utf-8') - self.fout.write( - struct.pack("iii", len(shape), len(sname), - DATA_TYPE_TO_FTYPE[data_type])) + self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) self.fout.write(sname) self.fout.seek((self.fout.tell() + 31) & -32) @@ -1190,8 +1102,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, - vocab: Vocab, file_type: NEFileType) -> None: + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) of.write_file_header(params, file_type) @@ -1203,13 +1114,11 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray: return lazy_tensor.load().to_ne().ndarray ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) - for i, ((name, lazy_tensor), - ndarray) in enumerate(zip(model.items(), ndarrays)): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ type {lazy_tensor.data_type}") of.write_tensor_header( name, @@ -1251,8 +1160,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.gguf_writer.add_int32('pad_token_id', 0) self.gguf_writer.add_int32('sep_token_id', 0) - def write_tensor_header_gguf(self, name: str, shape: Sequence[int], - data_type: DataType, data) -> None: + def write_tensor_header_gguf(self, name: str, shape: Sequence[int], data_type: DataType, data) -> None: # sname = name.encode('utf-8') # self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) # self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) @@ -1308,8 +1216,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, - vocab: Vocab, file_type: NEFileType) -> None: + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: check_vocab_size(params, vocab) of = OutputFile_GGUF(fname_out) of.write_file_header(params, file_type) @@ -1321,42 +1228,32 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray: return lazy_tensor.load().to_ne().ndarray ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) - for i, ((name, lazy_tensor), - ndarray) in enumerate(zip(model.items(), ndarrays)): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} 
| size {size:16} | \ + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | \ type {lazy_tensor.data_type}") - of.write_tensor_header_gguf(name, lazy_tensor.shape, - lazy_tensor.data_type, ndarray) + of.write_tensor_header_gguf(name, lazy_tensor.shape, lazy_tensor.data_type, ndarray) of.end() of.fout.close() -def pick_output_type(model: LazyModel, - output_type_str: Optional[str]) -> NEFileType: +def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> NEFileType: wq_type = model["layers.0.attention.wq.weight"].data_type - if output_type_str == "f32" or (output_type_str is None - and wq_type in (DT_F32, DT_BF16)): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return NEFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None - and wq_type == DT_F16): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return NEFileType.MostlyF16 - if output_type_str == "q4_1" or (output_type_str is None and isinstance( - wq_type, QuantizedDataType) and wq_type.have_addends): + if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) + and wq_type.have_addends): if isinstance(model["output.weight"].data_type, QuantizedDataType): return NEFileType.MostlyQ4_1 else: return NEFileType.PerLayerIsQ4_1 - if output_type_str == "q4_0" or (output_type_str is None and isinstance( - wq_type, QuantizedDataType)): + if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)): return NEFileType.MostlyQ4_0 - name_to_type = { - name: lazy_tensor.data_type - for (name, lazy_tensor) in model.items() - } + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} raise Exception(f"Unexpected combination of types: {name_to_type}") @@ -1370,12 +1267,8 @@ def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: return model -def convert_to_output_type(model: LazyModel, - output_type: NEFileType) -> LazyModel: - return { - name: tensor.astype(output_type.type_for_tensor(name, tensor)) - for (name, tensor) in model.items() - } +def convert_to_output_type(model: LazyModel, output_type: NEFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} def nth_multifile_path(path: Path, n: int) -> Optional[Path]: @@ -1425,10 +1318,7 @@ def load_some_model(path: Path) -> ModelPlus: files = list(path.glob("model-00001-of-*.safetensors")) if not files: # Try the PyTorch patterns too, with lower priority - globs = [ - "consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", - "pytorch_model.bin" - ] + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: # Try NE too, but with lower priority, since if both a non-NE @@ -1438,9 +1328,7 @@ def load_some_model(path: Path) -> ModelPlus: if not files: raise Exception(f"Can't find model in directory {path}") if len(files) > 1: - raise Exception( - f"Found multiple models in {path}, not sure which to pick: {files}" - ) + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") path = files[0] paths = find_multifile_paths(path) @@ -1475,8 +1363,7 @@ def load_vocab(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file 
{path}") - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) def default_outfile(model_paths: List[Path], params: Params) -> Path: @@ -1501,41 +1388,26 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") for name, lazy_tensor in model_plus.model.items(): - print( - f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}" - ) + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") def main(args_in: Optional[List[str]] = None) -> None: - parser = argparse.ArgumentParser( - description="Convert a LLaMa model to a NE compatible file") - parser.add_argument("--dump", + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a NE compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", - help="don't convert, just show what's in the model") - parser.add_argument( - "--dump-single", - action="store_true", - help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", - action="store_true", - help="extract only the vocab") + help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") - parser.add_argument( - "--vocab-dir", - type=Path, - help="directory containing tokenizer.model, if separate from model file" - ) - parser.add_argument("--outfile", + parser.add_argument("--vocab-dir", + type=Path, + help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, - help="path to write to; default: based on input") - parser.add_argument( - "model", - type=Path, - help= - "directory containing model file, or model file itself (*.pth, *.pt, *.bin)" - ) + help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("--format", type=str, default="NE", @@ -1562,8 +1434,7 @@ def main(args_in: Optional[List[str]] = None) -> None: if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab else: - vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[ - 0].parent + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) model = model_plus.model @@ -1574,8 +1445,7 @@ def main(args_in: Optional[List[str]] = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, params) if args.format == "GGUF": - OutputFile_GGUF.write_all(outfile, params, model, vocab, - output_type) + OutputFile_GGUF.write_all(outfile, params, model, vocab, output_type) else: OutputFile.write_all(outfile, params, model, vocab, output_type) diff --git a/requirements.txt b/requirements.txt index f82674779..e30a99c1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ datasets transformers_stream_generator tiktoken py-cpuinfo -cmake \ No newline at end of file +gguf +cmake