Skip to content

Commit

Permalink
Add inverse chat template metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
CISC authored Sep 25, 2024
1 parent 1e43630 commit ebcbc45
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 0 deletions.
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class Tokenizer:
CHAT_TEMPLATE = "tokenizer.chat_template"
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates"
INVERSE_TEMPLATE = "tokenizer.inverse_template"
# FIM/Infill special tokens constants
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:

self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

def add_inverse_template(self, value: str) -> None:
    """Write the inverse chat template string to the tokenizer metadata.

    The value is stored as a string under ``Keys.Tokenizer.INVERSE_TEMPLATE``.
    """
    key = Keys.Tokenizer.INVERSE_TEMPLATE
    self.add_string(key, value)

def add_prefix_token_id(self, id: int) -> None:
    """Write the prefix token id to the tokenizer metadata.

    The id is stored as a uint32 under ``Keys.Tokenizer.PREFIX_ID``.
    """
    key = Keys.Tokenizer.PREFIX_ID
    self.add_uint32(key, id)

Expand Down
11 changes: 11 additions & 0 deletions gguf-py/gguf/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class SpecialVocab:
add_special_token: dict[str, bool]
special_token_ids: dict[str, int]
chat_template: str | Sequence[Mapping[str, str]] | None
inverse_template: str | None

def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
Expand All @@ -33,6 +34,7 @@ def __init__(
self.load_merges = load_merges
self.merges = []
self.chat_template = None
self.inverse_template = None
if special_token_types is not None:
self.special_token_types = special_token_types
else:
Expand Down Expand Up @@ -71,6 +73,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if not quiet:
logger.info(f'Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template)
if self.inverse_template is not None:
if not quiet:
logger.info(f'Setting inverse_template to {self.inverse_template}')
gw.add_inverse_template(self.inverse_template)

def _load(self, path: Path) -> None:
self._try_load_from_tokenizer_json(path)
Expand Down Expand Up @@ -137,6 +143,11 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
self.chat_template = chat_template
else:
logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
inverse_template = tokenizer_config.get('inverse_template')
if inverse_template is None or isinstance(inverse_template, str):
self.inverse_template = inverse_template
else:
logger.warning(f'Bad type for inverse_template field in {tokenizer_config_file!r} - ignoring')
for typ in self.special_token_types:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):
Expand Down

0 comments on commit ebcbc45

Please sign in to comment.