Skip to content

Commit

Permalink
Fixes a bad merge in the tiktoken PR (#619)
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg authored Sep 21, 2023
1 parent a8e8783 commit 4402831
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
2 changes: 1 addition & 1 deletion llmfoundry/utils/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def build_tokenizer(
int(1e30),
)

return tokenizer
return tokenizer


def build_icl_evaluators(
Expand Down
31 changes: 31 additions & 0 deletions tests/test_builders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import pytest
from transformers import PreTrainedTokenizerBase

from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
from llmfoundry.utils.builders import build_tokenizer


@pytest.mark.parametrize(
    'tokenizer_name,tokenizer_kwargs',
    [
        ('tiktoken', {'model_name': 'gpt-4'}),
        ('EleutherAI/gpt-neo-125M', {'model_max_length': 10}),
        ('mosaicml/mpt-7b', {'model_max_length': 20}),
    ],
)
def test_tokenizer_builder(tokenizer_name: str, tokenizer_kwargs: dict):
    """Check what ``build_tokenizer`` returns for each parametrized case.

    The ``'tiktoken'`` case must yield a ``TiktokenTokenizerWrapper`` whose
    ``model_name`` equals the requested one. Every other case must yield a
    ``PreTrainedTokenizerBase`` whose ``model_max_length`` equals the value
    passed in ``tokenizer_kwargs``.

    Args:
        tokenizer_name: Name handed to ``build_tokenizer``.
        tokenizer_kwargs: Keyword arguments handed to ``build_tokenizer``.
    """
    built = build_tokenizer(tokenizer_name, tokenizer_kwargs)

    # Handle the non-tiktoken cases first so the tiktoken-specific checks
    # can sit un-nested at the end of the test.
    if tokenizer_name != 'tiktoken':
        assert built.model_max_length == tokenizer_kwargs['model_max_length']
        assert isinstance(built, PreTrainedTokenizerBase)
        return

    assert isinstance(built, TiktokenTokenizerWrapper)
    assert built.model_name == tokenizer_kwargs['model_name']

0 comments on commit 4402831

Please sign in to comment.