diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 071fb98ed8..104ad1764c 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -184,7 +184,7 @@ def build_tokenizer(
         int(1e30),
     )
 
-    return tokenizer 
+    return tokenizer
 
 
 def build_icl_evaluators(
diff --git a/tests/test_builders.py b/tests/test_builders.py
new file mode 100644
index 0000000000..adff8e55ee
--- /dev/null
+++ b/tests/test_builders.py
@@ -0,0 +1,31 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from transformers import PreTrainedTokenizerBase
+
+from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
+from llmfoundry.utils.builders import build_tokenizer
+
+
+@pytest.mark.parametrize('tokenizer_name,tokenizer_kwargs', [
+    ('tiktoken', {
+        'model_name': 'gpt-4'
+    }),
+    ('EleutherAI/gpt-neo-125M', {
+        'model_max_length': 10
+    }),
+    ('mosaicml/mpt-7b', {
+        'model_max_length': 20
+    }),
+])
+def test_tokenizer_builder(tokenizer_name: str, tokenizer_kwargs: dict):
+    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+
+    if tokenizer_name == 'tiktoken':
+        assert isinstance(tokenizer, TiktokenTokenizerWrapper)
+        assert tokenizer.model_name == tokenizer_kwargs['model_name']
+    else:
+        assert tokenizer.model_max_length == tokenizer_kwargs[
+            'model_max_length']
+        assert isinstance(tokenizer, PreTrainedTokenizerBase)
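
A quick way to exercise the new test locally (a sketch, assuming a development install of llm-foundry with pytest available; the Hugging Face tokenizers are downloaded on first use):

    pytest tests/test_builders.py -k test_tokenizer_builder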