Fix huggingface tokenizer loading for slow tokenizers (mosaicml#2483)
dakinggg authored Aug 28, 2023
1 parent f4c56c9 commit a6c0b0f
Showing 2 changed files with 76 additions and 40 deletions.
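For context, the bug being fixed: when a Composer checkpoint containing a slow (e.g. sentencepiece-based) tokenizer was loaded, the tokenizer files were reconstructed in a temporary directory that was deleted after loading, even though HuggingFace keeps reading the slow tokenizer file from disk. A minimal sketch of the round trip this commit repairs, assuming a checkpoint saved with a `HuggingFaceModel`; the path and sample text are illustrative:

```python
from composer.models import HuggingFaceModel

# hf_from_composer_checkpoint returns the wrapped model and, if one was
# saved, the tokenizer ('hf-checkpoint.pt' is an illustrative path).
model, tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
    checkpoint_path='hf-checkpoint.pt')

# With this fix, the slow tokenizer's files persist on disk, so the
# tokenizer remains usable (and re-saveable) after loading.
assert tokenizer is not None
_ = tokenizer('some sample text')
tokenizer.save_pretrained('./resaved-tokenizer')
```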
99 changes: 61 additions & 38 deletions composer/models/huggingface.py
@@ -8,6 +8,9 @@
 import inspect
 import json
 import logging
+import os
+import random
+import string
 import tempfile
 import textwrap
 from pathlib import Path
@@ -140,14 +143,17 @@ def __init__(self,
         self.dummy_forward_called = False
 
     @staticmethod
-    def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
-                                                    trust_remote_code: bool = False
-                                                   ) -> Optional[transformers.PreTrainedTokenizer]:
+    def load_huggingface_tokenizer_from_saved_state(
+            hf_state: Dict[str, Any],
+            trust_remote_code: bool = False,
+            tokenizer_save_dir: Optional[str] = None) -> Optional[transformers.PreTrainedTokenizer]:
         """A helper function that loads a HuggingFace tokenizer from a loaded in hf state.
 
         Args:
             hf_state (Dict[str, Any]): HF state loaded from a Composer checkpoint.
             trust_remote_code (bool, optional): Whether to trust the remote code when loading the tokenizer. Defaults to False.
+            tokenizer_save_dir (Optional[str], optional): If specified, where to save the tokenizer files to locally. If not specified,
+                a folder with a unique suffix will be saved in the current working directory. Defaults to None.
 
         Returns:
             Optional[transformers.PreTrainedTokenizer]: The loaded HuggingFace tokenizer
@@ -161,40 +167,56 @@ def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
         hf_tokenizer = None
         hf_tokenizer_state = hf_state['tokenizer']
         if hf_tokenizer_state != {}:
-            with tempfile.TemporaryDirectory() as _tmp_dir:
-                for filename, saved_content in hf_tokenizer_state.items():
-                    tokenizer_file_path = Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}'
-                    if saved_content['file_extension'] == '.json':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            json.dump(saved_content['content'], _tmp_file)
-                    elif saved_content['file_extension'] == '.txt':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            for line in saved_content['content']:
-                                _tmp_file.write(line)
-                                _tmp_file.write('\n')
-                    elif saved_content['file_extension'] == '.py':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            _tmp_file.write(saved_content['content'])
-                    elif saved_content['file_extension'] == '.model':
-                        try:
-                            import sentencepiece as spm
-                        except ImportError as e:
-                            raise MissingConditionalImportError(extra_deps_group='sentencepiece',
-                                                                conda_package='sentencepiece') from e
-                        s = spm.SentencePieceProcessor()
-                        s.load_from_serialized_proto(saved_content['content'])
-                        with open(tokenizer_file_path, 'wb') as _tmp_file:
-                            _tmp_file.write(s.serialized_model_proto())
-
-                hf_tokenizer = transformers.AutoTokenizer.from_pretrained(_tmp_dir, trust_remote_code=trust_remote_code)
-
-                # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
-                hf_tokenizer.name_or_path = hf_tokenizer_state['tokenizer_config']['content'].get('name_or_path', '')
-                hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
-
-                # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
-                # for the original tokenizer, so we default it to None
-                hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
+            if tokenizer_save_dir is None:
+                unique_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
+                tokenizer_save_dir = os.path.join(os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
+            os.makedirs(tokenizer_save_dir, exist_ok=True)
+
+            for filename, saved_content in hf_tokenizer_state.items():
+                # This cannot be a temporary directory because huggingface relies on the slow tokenizer file
+                # being persistent on disk
+
+                # For backwards compatibility, check if the filename already has the file extension
+                if filename.endswith(saved_content['file_extension']):
+                    tokenizer_file_name = filename
+                else:
+                    tokenizer_file_name = filename + saved_content['file_extension']
+
+                tokenizer_file_path = Path(tokenizer_save_dir) / tokenizer_file_name
+                if saved_content['file_extension'] == '.json':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        json.dump(saved_content['content'], _f)
+                elif saved_content['file_extension'] == '.txt':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        for line in saved_content['content']:
+                            _f.write(line)
+                            _f.write('\n')
+                elif saved_content['file_extension'] == '.py':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        _f.write(saved_content['content'])
+                elif saved_content['file_extension'] == '.model':
+                    try:
+                        import sentencepiece as spm
+                    except ImportError as e:
+                        raise MissingConditionalImportError(extra_deps_group='sentencepiece',
+                                                            conda_package='sentencepiece') from e
+                    s = spm.SentencePieceProcessor()
+                    s.load_from_serialized_proto(saved_content['content'])
+                    with open(tokenizer_file_path, 'wb') as _f:
+                        _f.write(s.serialized_model_proto())
+
+            hf_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_save_dir,
+                                                                      trust_remote_code=trust_remote_code)
+
+            # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
+            # For backwards compatibility we try both the old and new key
+            tokenizer_config_key = 'tokenizer_config.json' if 'tokenizer_config.json' in hf_tokenizer_state else 'tokenizer_config'
+            hf_tokenizer.name_or_path = hf_tokenizer_state[tokenizer_config_key]['content'].get('name_or_path', '')
+            hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
+
+            # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
+            # for the original tokenizer, so we default it to None
+            hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
         return hf_tokenizer
 
     @staticmethod
@@ -498,7 +520,8 @@ def get_metadata(self):
                     else:
                         raise ValueError(
                             f'Unexpected file ending {tokenizer_file_name} in output of tokenizer.save_pretrained.')
-                    tokenizer_output[tokenizer_file_path.stem] = {
+
+                    tokenizer_output[tokenizer_file_path.name] = {
                         'file_extension': tokenizer_file_extension,
                         'content': tokenizer_file_content
                     }
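The `get_metadata` change above switches the saved key from `tokenizer_file_path.stem` to `tokenizer_file_path.name`, so new checkpoints record each tokenizer file under its full filename; the `endswith` branch added to the loader keeps old stem-keyed checkpoints readable. A small sketch of the difference (using T5's `spiece.model` as an illustrative filename):

```python
from pathlib import Path

# Old key: the stem drops the extension, so the loader had to re-append
# saved_content['file_extension'] when writing the file back out.
assert Path('spiece.model').stem == 'spiece'

# New key: the name already carries the extension.
assert Path('spiece.model').name == 'spiece.model'

# The loader's backwards-compatibility check accepts either key style:
ext = '.model'
for filename in ('spiece', 'spiece.model'):
    name = filename if filename.endswith(ext) else filename + ext
    assert name == 'spiece.model'
```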
17 changes: 15 additions & 2 deletions tests/models/test_hf_model.py
@@ -109,6 +109,7 @@ def test_hf_train_eval_predict(num_classes: int, tiny_bert_config):
     assert predictions[0]['logits'].shape == (batch_size, num_classes)
 
 
+@pytest.mark.filterwarnings('ignore: The variance of predictions')
 def test_hf_train_eval_predict_regression(tiny_deberta_config):
     transformers = pytest.importorskip('transformers')
 
@@ -346,7 +347,7 @@ def test_hf_state_dict_info(tmp_path: Path, pass_in_tokenizer: bool, modify_tokenizer
     with tempfile.TemporaryDirectory() as _tmp_dir:
         if dist.get_local_rank() == 0:
             for filename, saved_content in hf_tokenizer_state.items():
-                with open(Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}', 'w') as _tmp_file:
+                with open(Path(_tmp_dir) / filename, 'w') as _tmp_file:
                     if saved_content['file_extension'] == '.json':
                         json.dump(saved_content['content'], _tmp_file)
                     elif saved_content['file_extension'] == '.txt':
@@ -543,7 +544,8 @@ def test_hf_loading_load_save_paths(checkpoint_upload_path: Optional[str], local


 @pytest.mark.parametrize('modify_tokenizer', [False, True])
-def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny_t5_model):
+@pytest.mark.parametrize('save_fast', [True, False])
+def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, save_fast: bool, tiny_t5_model):
     transformers = pytest.importorskip('transformers')
 
     t0_pp_tokenizer = transformers.AutoTokenizer.from_pretrained('bigscience/T0pp')
@@ -558,9 +560,20 @@ def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny_t5_model):
     trainer = get_lm_trainer(tiny_t5_model, t0_pp_tokenizer, str(tmp_path), is_conditional_generation=True)
     trainer.save_checkpoint(str(tmp_path / 'hf-checkpoint.pt'))
 
+    if not save_fast:
+        sd = torch.load(str(tmp_path / 'hf-checkpoint.pt'))
+        # remove the fast tokenizer file from the checkpoint
+        del sd['state']['integrations']['huggingface']['tokenizer']['tokenizer.json']
+        torch.save(sd, str(tmp_path / 'hf-checkpoint.pt'))
+
     hf_loaded_model, hf_loaded_tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
         checkpoint_path=str(tmp_path / 'hf-checkpoint.pt'))
 
+    # Make sure we can use the loaded tokenizer and save it again
+    assert hf_loaded_tokenizer is not None
+    _ = hf_loaded_tokenizer('This is some text that should get tokenizer !? @ totallyarealtoken')
+    hf_loaded_tokenizer.save_pretrained(str(tmp_path / 'hf-tokenizer-2'))
+
     check_hf_model_equivalence(hf_loaded_model, tiny_t5_model)
     check_hf_tokenizer_equivalence(hf_loaded_tokenizer, t0_pp_tokenizer)
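For callers that want the tokenizer files in a known location rather than an auto-generated `tokenizer-save-dir-*` folder, the new `tokenizer_save_dir` argument can be passed directly. A hedged sketch, assuming an on-disk Composer checkpoint laid out as in the test above (paths are illustrative):

```python
import torch

from composer.models import HuggingFaceModel

# Pull the HuggingFace integration state out of the raw checkpoint,
# mirroring the structure the test above manipulates.
state_dict = torch.load('hf-checkpoint.pt', map_location='cpu')
hf_state = state_dict['state']['integrations']['huggingface']

# The save dir must outlive the tokenizer: slow tokenizers keep reading
# their files from disk after from_pretrained returns.
tokenizer = HuggingFaceModel.load_huggingface_tokenizer_from_saved_state(
    hf_state, tokenizer_save_dir='./my-tokenizer-files')
```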
