From a1372737e0893490864e202bb8e48d14ab97fd2b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 16 Jan 2024 14:03:57 +0200
Subject: [PATCH] py : pad with unknown tokens when data is missing

ggml-ci
---
 convert.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/convert.py b/convert.py
index 316028592676a..dfd4c46214fe8 100755
--- a/convert.py
+++ b/convert.py
@@ -1098,6 +1098,15 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
             scores.append(score)
             toktypes.append(toktype)
 
+        # pad with unknown tokens and print warnings
+        # ref: https://github.com/ggerganov/llama.cpp/issues/4958
+        if len(tokens) < vocab.vocab_size:
+            for i in range(len(tokens), vocab.vocab_size):
+                tokens.append(f"".encode("utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNKNOWN)
+                print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")
+
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
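
Note: the snippet below is a minimal, self-contained sketch of the padding behaviour added by this patch, useful for trying the logic outside of convert.py. The names UNKNOWN, vocab_size, and the toy token lists are stand-ins chosen here for illustration; the real patch uses gguf.TokenType.UNKNOWN and the loader's vocab.vocab_size.

    # Sketch of the padding loop added in this patch (assumptions noted above).
    UNKNOWN = 2  # placeholder for gguf.TokenType.UNKNOWN

    vocab_size = 5                              # target size reported by the model
    tokens   = [b"<s>", b"</s>", b"hello"]      # toy vocab with only 3 entries
    scores   = [0.0, 0.0, -1.0]
    toktypes = [1, 1, 6]

    # pad with unknown tokens and print warnings, mirroring the diff above
    if len(tokens) < vocab_size:
        for i in range(len(tokens), vocab_size):
            tokens.append("".encode("utf-8"))
            scores.append(-1000.0)
            toktypes.append(UNKNOWN)
            print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")

    # all three lists are now padded to the model's vocab size
    assert len(tokens) == len(scores) == len(toktypes) == vocab_size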