Skip to content

Commit

Permalink
Fix error when decoding a token in the id gap (or out of range) in a tiktoken tokenizer (#841)
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg authored Jan 8, 2024
1 parent 5e85bd6 commit 5b99488
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 5 deletions.
5 changes: 4 additions & 1 deletion llmfoundry/tokenizers/tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,10 @@ def _convert_token_to_id(self, token: str) -> Optional[int]:

def _convert_id_to_token(self, index: int) -> Optional[str]:
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
# For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer,
# we return empty string. This matches the behavior of Hugging Face fast tokenizers,
# but not slow tokenizers.
return self.decoder.get(index, '')

def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (string) in a single string."""
Expand Down
4 changes: 2 additions & 2 deletions llmfoundry/utils/huggingface_hub_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def process_file(
folder_path: str,
flatten_imports_prefix: Sequence[str],
) -> list[str]:
with open(file_path, 'r') as f:
with open(file_path, 'r', encoding='utf-8') as f:
source = f.read()

parent_module_name = None
Expand Down Expand Up @@ -102,7 +102,7 @@ def process_file(
if new_filename == '__init__.py':
new_filename = file_path.split('/')[-2] + '.py'
new_file_path = os.path.join(folder_path, new_filename)
with open(new_file_path, 'w') as f:
with open(new_file_path, 'w', encoding='utf-8') as f:
assert new_tree is not None
f.write(ast.unparse(new_tree))

Expand Down
20 changes: 18 additions & 2 deletions tests/tokenizers/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,19 +338,23 @@ def test_additional_special_tokens(model_name: Optional[str],
encoding_name: Optional[str],
tmp_path: pathlib.Path):
special_token_to_add = '<|im_start|>'
input_string = special_token_to_add + ' hello'
wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
model_name,
encoding_name,
tmp_path,
add_bos_token=False,
add_eos_token=False,
additional_special_tokens=[special_token_to_add])
encoded_outputs = wrapped_tokenizer(special_token_to_add +
' hello')['input_ids']
encoded_outputs = wrapped_tokenizer(input_string)['input_ids']

assert encoded_outputs[0] == wrapped_tokenizer.vocab_size
assert len(encoded_outputs) == 2

decoded_outputs = wrapped_tokenizer.decode(
encoded_outputs, spaces_between_special_tokens=False)
assert decoded_outputs == input_string


@pytest.mark.parametrize('model_name,encoding_name',
MODEL_ENCODING_NAME_PARAMETRIZATION)
Expand Down Expand Up @@ -386,3 +390,15 @@ def test_chat_formatting(model_name: Optional[str],
chat_str = wrapped_tokenizer.apply_chat_template(
dict_chats, tokenize=False, add_generation_prompt=True)
assert chat_str == MULTI_TURN_GENERATE_STRING[i]


def test_tiktoken_out_of_range():
    """Decoding ids with no corresponding token yields an empty string.

    Both the list form and the bare-int form of ``decode`` are checked for
    each id.
    """
    wrapped_tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4',)

    unmapped_ids = (
        # For gpt-4, 100256 is less than the vocab size, but is not a valid token
        100256,
        # For gpt-4, 1000000 is greater than the vocab size
        1000000,
    )
    for token_id in unmapped_ids:
        assert wrapped_tokenizer.decode([token_id]) == ''
        assert wrapped_tokenizer.decode(token_id) == ''

0 comments on commit 5b99488

Please sign in to comment.