Skip to content

Commit

Permalink
py: pad with unknown tokens when vocab data is missing
Browse files Browse the repository at this point in the history
ggml-ci
  • Loading branch information
ggerganov committed Jan 16, 2024
1 parent 9b464b4 commit a137273
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,15 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
scores.append(score)
toktypes.append(toktype)

# pad with unknown tokens and print warnings
# ref: https://github.com/ggerganov/llama.cpp/issues/4958
if len(tokens) < vocab.vocab_size:
for i in range(len(tokens), vocab.vocab_size):
tokens.append(f"<unk{i}>".encode("utf-8"))
scores.append(-1000.0)
toktypes.append(gguf.TokenType.UNKNOWN)
print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")

return tokens, scores, toktypes

def add_meta_vocab(self, vocab: Vocab) -> None:
Expand Down

0 comments on commit a137273

Please sign in to comment.