convert : handle tokenizer merges format from transformers 4.45 (gger…
compilade authored and arthw committed Nov 15, 2024
1 parent 40aabf4 commit 5fa8294
Showing 1 changed file with 24 additions and 2 deletions.

gguf-py/gguf/vocab.py
@@ -122,8 +122,30 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                 tokenizer = json.load(f)
             if self.load_merges:
                 merges = tokenizer.get('model', {}).get('merges')
-                if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                    self.merges = merges
+                if isinstance(merges, list) and merges:
+                    if isinstance(merges[0], str):
+                        self.merges = merges
+                    elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
+                        # New format since transformers 4.45 to support spaces in merges
+                        # ref: https://github.com/ggerganov/llama.cpp/issues/9692
+                        # TODO: internally store as the new format instead of converting to old
+                        if any(' ' in s for pair in merges for s in pair):
+                            logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
+                        self.merges = [
+                            ' '.join(
+                                [
+                                    # ensure the spaces are properly encoded
+                                    ''.join(
+                                        chr(ord(c) + 256) if c == ' ' else c
+                                        for c in part
+                                    )
+                                    for part in pair
+                                ]
+                            )
+                            for pair in merges
+                        ]
+                    else:
+                        raise ValueError("Unknown tokenizer merges format")
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
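For context, below is a minimal standalone sketch of the same conversion (the convert_merges helper is hypothetical, not part of gguf-py). It rewrites the list-of-pairs merges emitted by transformers 4.45+ into the older space-separated string format, encoding any literal space inside a merge part as chr(ord(' ') + 256), i.e. U+0120 'Ġ', so the single space joining the two parts of a merge stays unambiguous.

    def convert_merges(merges: list) -> list[str]:
        """Convert tokenizer merges to the old space-separated string format."""
        if merges and isinstance(merges[0], str):
            return merges  # already in the old format
        converted = []
        for pair in merges:
            # encode spaces inside each part so ' ' can still act as the separator
            parts = [''.join(chr(ord(c) + 256) if c == ' ' else c for c in part)
                     for part in pair]
            converted.append(' '.join(parts))
        return converted

    print(convert_merges(['t h', 'th e']))            # old format passes through: ['t h', 'th e']
    print(convert_merges([['t', 'h'], ['th', 'e']]))  # new format: ['t h', 'th e']
    print(convert_merges([[' t', 'h']]))              # space becomes 'Ġ' (U+0120): ['Ġt h']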
