ggerganov · ggerganov · Dec 14, 2023 · Oct 15, 2023 · Oct 15, 2023 · Oct 15, 2023
diff --git a/convert.py b/convert.py
@@ -300,7 +300,6 @@ def load(model_plus: ModelPlus) -> Params:
 #
 # vocab
 #
-
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
@@ -414,7 +413,59 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
-Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
+
+class HFVocab:
+    """Vocabulary read through a Hugging Face `AutoTokenizer`, plus optional added tokens."""
+
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ModuleNotFoundError as e:
+            raise ImportError(
+                "To use HFVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
+        vocab_size: int = self.tokenizer.vocab_size
+
+        added_tokens: dict[str, int] = {}
+        if fname_added_tokens is not None:
+            with open(fname_added_tokens, encoding="utf-8") as f:  # closes the handle
+                added_tokens = json.load(f)
+
+        # Added token IDs must directly follow the base vocabulary, with no gaps.
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for text, _ in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield each base token in ID order, UTF-8 encoded, with score 0.0 and type NORMAL."""
+        reverse_vocab = {tok_id: encoded_tok for encoded_tok, tok_id in self.tokenizer.vocab.items()}
+        for i in range(self.tokenizer.vocab_size):
+            yield reverse_vocab[i].encode("utf-8"), 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield each added token, UTF-8 encoded, with score -1000.0 and type USER_DEFINED."""
+        for text in self.added_tokens_list:
+            yield text.encode("utf-8"), -1000.0, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the base tokenizer tokens followed by the added tokens."""
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<HFVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab | HFVocab'
 
 #
 # data loading
@@ -844,12 +895,12 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
             scores.append(score)
             toktypes.append(toktype)
 
-        if isinstance(vocab, SentencePieceVocab):
+        if isinstance(vocab, SentencePieceVocab) or isinstance(vocab, HFVocab):
             self.gguf.add_tokenizer_model("llama")
         elif isinstance(vocab, BpeVocab):
             self.gguf.add_tokenizer_model("gpt2")
         else:
-            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab or HFVocab')
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
@@ -1057,25 +1108,51 @@ def load_some_model(path: Path) -> ModelPlus:
     return model_plus
 
 
+def vocab_check_and_append_path(path: Path, vocab_file: str) -> Path | None:
+    """Locate `vocab_file` in `path` or in the parent directory of `path`.
+
+    Returns the first of `path / vocab_file` and `path.parent / vocab_file`
+    that exists, or None when neither exists.
+    """
+    # Use `.parent` instead of /.. to handle the symlink case better.
+    candidates = (path / vocab_file, path.parent / vocab_file)
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+
+    return None
+
+
+
 def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
+        find_candidates = []
+
         vocab_file = "tokenizer.model"
-        if vocabtype == 'bpe':
+        if vocabtype == "bpe":
             vocab_file = "vocab.json"
-        path2 = path / vocab_file
-        # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / vocab_file
-        if path2.exists():
-            path = path2
-        elif path3.exists():
-            path = path3
+
+        path_candidate = vocab_check_and_append_path(path, vocab_file)
+        find_candidates.append(vocab_file)
+
+        if path_candidate is None:
+            vocab_file = "tokenizer.json"
+            hf_path = vocab_check_and_append_path(path, vocab_file)
+            find_candidates.append(vocab_file)
+
+            if vocabtype == "spm" and hf_path is not None:
+                # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab.
+                vocabtype = "hf"
+            else:
+                raise FileNotFoundError(
+                    f"Could not find {find_candidates} in {path} or its parent; "
+                    "if it's in another directory, pass the directory as --vocab-dir")
         else:
-            raise FileNotFoundError(
-                f"Could not find {vocab_file} in {path} or its parent; "
-                "if it's in another directory, pass the directory as --vocab-dir")
+            path = path_candidate
+
 
     print(f"Loading vocab file '{path}', type '{vocabtype}'")
 
@@ -1084,6 +1161,8 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
         return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
     elif vocabtype == "spm":
         return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "hf":
+        return HFVocab(path, added_tokens_path if added_tokens_path.exists() else None)
     else:
         raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 
@@ -1120,7 +1199,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
     parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
+    parser.add_argument("--vocabtype",   choices=["spm", "bpe", "hf"], help="vocab format (default: spm)", default="spm")
     parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     args = parser.parse_args(args_in)
@@ -1162,7 +1241,7 @@ def main(args_in: list[str] | None = None) -> None:
         assert args.outfile, "need --outfile if using --vocab-only"
         # FIXME: Try to respect vocab_dir somehow?
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab))
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")
@@ -1174,7 +1253,7 @@ def main(args_in: list[str] | None = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab))
 
     model   = model_plus.model
     model   = convert_model_names(model, params)