diff --git a/megatron/arguments.py b/megatron/arguments.py
index d5e5970865..d1d69e7b3f 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1245,8 +1245,11 @@ def _add_data_args(parser):
                                 'GPT2BPETokenizer',
                                 'SentencePieceTokenizer',
                                 'GPTSentencePieceTokenizer',
+                                'HFAutoTokenizer',
                                 'NullTokenizer'],
                        help='What type of tokenizer to use.')
+    group.add_argument('--hf_autotokenizer_model', type=str, default=None,
+                       help='Name of HF tokenizer model.')
     group.add_argument('--tokenizer-model', type=str, default=None,
                        help='Sentencepiece tokenizer model.')
     group.add_argument('--data-impl', type=str, default='infer',
diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 1ae4fd8487..71a706bf2c 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -10,8 +10,6 @@ from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
 
 
-from transformers import AutoTokenizer
-
 def build_tokenizer(args):
     """Initialize tokenizer."""
     if args.rank == 0:
@@ -137,6 +135,7 @@ class _HFAutoTokenizer(AbstractTokenizer):
     def __init__(self, pretrained_model_name_or_path: str):
         name = "HFAutoTokenizer"
        super().__init__(name)
+        from transformers import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
         self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
         self.eod_id = self.tokenizer.eos_token_id
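
Note: the diff above adds the HFAutoTokenizer choice and the --hf_autotokenizer_model flag, but the branch of build_tokenizer that constructs _HFAutoTokenizer is not shown. A minimal sketch of that dispatch, assuming it follows the pattern of the existing tokenizer types and reads the model name from args.hf_autotokenizer_model, could look like:

    # Hypothetical branch inside build_tokenizer(args) in
    # megatron/tokenizer/tokenizer.py; not part of this diff.
    elif args.tokenizer_type == 'HFAutoTokenizer':
        # --hf_autotokenizer_model carries the HF hub name or a local path.
        assert args.hf_autotokenizer_model is not None, \
            'HFAutoTokenizer requires --hf_autotokenizer_model'
        tokenizer = _HFAutoTokenizer(args.hf_autotokenizer_model)

With that wiring, a run would pass --tokenizer-type HFAutoTokenizer --hf_autotokenizer_model <name-or-path>. Moving the transformers import into _HFAutoTokenizer.__init__ keeps transformers an optional dependency for the other tokenizer types.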