From a6c0b0f8c11ee7bce3cc1328cf93334258bf4875 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Mon, 28 Aug 2023 11:35:34 -0700
Subject: [PATCH] Fix huggingface tokenizer loading for slow tokenizers (#2483)

---
 composer/models/huggingface.py | 99 +++++++++++++++++++++-------------
 tests/models/test_hf_model.py  | 17 +++++-
 2 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py
index 608678c204..bec3a1f08a 100644
--- a/composer/models/huggingface.py
+++ b/composer/models/huggingface.py
@@ -8,6 +8,9 @@
 import inspect
 import json
 import logging
+import os
+import random
+import string
 import tempfile
 import textwrap
 from pathlib import Path
@@ -140,14 +143,17 @@ def __init__(self,
         self.dummy_forward_called = False
 
     @staticmethod
-    def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
-                                                    trust_remote_code: bool = False
-                                                   ) -> Optional[transformers.PreTrainedTokenizer]:
+    def load_huggingface_tokenizer_from_saved_state(
+            hf_state: Dict[str, Any],
+            trust_remote_code: bool = False,
+            tokenizer_save_dir: Optional[str] = None) -> Optional[transformers.PreTrainedTokenizer]:
         """A helper function that loads a HuggingFace tokenizer from a loaded in hf state.
 
         Args:
             hf_state (Dict[str, Any]): HF state loaded from a Composer checkpoint.
             trust_remote_code (bool, optional): Whether to trust the remote code when loading the tokenizer. Defaults to False.
+            tokenizer_save_dir (Optional[str], optional): If specified, where to save the tokenizer files to locally. If not specified,
+                a folder with a unique suffix will be saved in the current working directory. Defaults to None.
 
         Returns:
             Optional[transformers.PreTrainedTokenizer]: The loaded HuggingFace tokenizer
@@ -161,40 +167,56 @@ def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
         hf_tokenizer = None
         hf_tokenizer_state = hf_state['tokenizer']
         if hf_tokenizer_state != {}:
-            with tempfile.TemporaryDirectory() as _tmp_dir:
-                for filename, saved_content in hf_tokenizer_state.items():
-                    tokenizer_file_path = Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}'
-                    if saved_content['file_extension'] == '.json':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            json.dump(saved_content['content'], _tmp_file)
-                    elif saved_content['file_extension'] == '.txt':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            for line in saved_content['content']:
-                                _tmp_file.write(line)
-                                _tmp_file.write('\n')
-                    elif saved_content['file_extension'] == '.py':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            _tmp_file.write(saved_content['content'])
-                    elif saved_content['file_extension'] == '.model':
-                        try:
-                            import sentencepiece as spm
-                        except ImportError as e:
-                            raise MissingConditionalImportError(extra_deps_group='sentencepiece',
-                                                                conda_package='sentencepiece') from e
-                        s = spm.SentencePieceProcessor()
-                        s.load_from_serialized_proto(saved_content['content'])
-                        with open(tokenizer_file_path, 'wb') as _tmp_file:
-                            _tmp_file.write(s.serialized_model_proto())
-
-                hf_tokenizer = transformers.AutoTokenizer.from_pretrained(_tmp_dir, trust_remote_code=trust_remote_code)
-
-                # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
-                hf_tokenizer.name_or_path = hf_tokenizer_state['tokenizer_config']['content'].get('name_or_path', '')
-                hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
-
-                # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
-                # for the original tokenizer, so we default it to None
-                hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
+            if tokenizer_save_dir is None:
+                unique_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
+                tokenizer_save_dir = os.path.join(os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
+            os.makedirs(tokenizer_save_dir, exist_ok=True)
+
+            for filename, saved_content in hf_tokenizer_state.items():
+                # This cannot be a temporary directory because huggingface relies on the slow tokenizer file
+                # being persistent on disk
+
+                # For backwards compatibility, check if the filename already has the file extension
+                if filename.endswith(saved_content['file_extension']):
+                    tokenizer_file_name = filename
+                else:
+                    tokenizer_file_name = filename + saved_content['file_extension']
+
+                tokenizer_file_path = Path(tokenizer_save_dir) / tokenizer_file_name
+                if saved_content['file_extension'] == '.json':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        json.dump(saved_content['content'], _f)
+                elif saved_content['file_extension'] == '.txt':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        for line in saved_content['content']:
+                            _f.write(line)
+                            _f.write('\n')
+                elif saved_content['file_extension'] == '.py':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        _f.write(saved_content['content'])
+                elif saved_content['file_extension'] == '.model':
+                    try:
+                        import sentencepiece as spm
+                    except ImportError as e:
+                        raise MissingConditionalImportError(extra_deps_group='sentencepiece',
+                                                            conda_package='sentencepiece') from e
+                    s = spm.SentencePieceProcessor()
+                    s.load_from_serialized_proto(saved_content['content'])
+                    with open(tokenizer_file_path, 'wb') as _f:
+                        _f.write(s.serialized_model_proto())
+
+            hf_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_save_dir,
+                                                                      trust_remote_code=trust_remote_code)
+
+            # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
+            # For backwards compatibility we try both the old and new key
+            tokenizer_config_key = 'tokenizer_config.json' if 'tokenizer_config.json' in hf_tokenizer_state else 'tokenizer_config'
+            hf_tokenizer.name_or_path = hf_tokenizer_state[tokenizer_config_key]['content'].get('name_or_path', '')
+            hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
+
+            # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
+            # for the original tokenizer, so we default it to None
+            hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
         return hf_tokenizer
 
     @staticmethod
@@ -498,7 +520,8 @@ def get_metadata(self):
                     else:
                         raise ValueError(
                             f'Unexpected file ending {tokenizer_file_name} in output of tokenizer.save_pretrained.')
-                    tokenizer_output[tokenizer_file_path.stem] = {
+
+                    tokenizer_output[tokenizer_file_path.name] = {
                         'file_extension': tokenizer_file_extension,
                         'content': tokenizer_file_content
                     }
diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py
index a0ca8ad327..384064da33 100644
--- a/tests/models/test_hf_model.py
+++ b/tests/models/test_hf_model.py
@@ -109,6 +109,7 @@ def test_hf_train_eval_predict(num_classes: int, tiny_bert_config):
     assert predictions[0]['logits'].shape == (batch_size, num_classes)
 
 
+@pytest.mark.filterwarnings('ignore: The variance of predictions')
 def test_hf_train_eval_predict_regression(tiny_deberta_config):
     transformers = pytest.importorskip('transformers')
@@ -346,7 +347,7 @@ def test_hf_state_dict_info(tmp_path: Path, pass_in_tokenizer: bool, modify_toke
     with tempfile.TemporaryDirectory() as _tmp_dir:
         if dist.get_local_rank() == 0:
             for filename, saved_content in hf_tokenizer_state.items():
-                with open(Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}', 'w') as _tmp_file:
+                with open(Path(_tmp_dir) / filename, 'w') as _tmp_file:
                     if saved_content['file_extension'] == '.json':
                         json.dump(saved_content['content'], _tmp_file)
                     elif saved_content['file_extension'] == '.txt':
@@ -543,7 +544,8 @@ def test_hf_loading_load_save_paths(checkpoint_upload_path: Optional[str], local
 
 
 @pytest.mark.parametrize('modify_tokenizer', [False, True])
-def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny_t5_model):
+@pytest.mark.parametrize('save_fast', [True, False])
+def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, save_fast: bool, tiny_t5_model):
     transformers = pytest.importorskip('transformers')
 
     t0_pp_tokenizer = transformers.AutoTokenizer.from_pretrained('bigscience/T0pp')
@@ -558,9 +560,20 @@ def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Pa
     trainer = get_lm_trainer(tiny_t5_model, t0_pp_tokenizer, str(tmp_path), is_conditional_generation=True)
     trainer.save_checkpoint(str(tmp_path / 'hf-checkpoint.pt'))
 
+    if not save_fast:
+        sd = torch.load(str(tmp_path / 'hf-checkpoint.pt'))
+        # remove the fast tokenizer file from the checkpoint
+        del sd['state']['integrations']['huggingface']['tokenizer']['tokenizer.json']
+        torch.save(sd, str(tmp_path / 'hf-checkpoint.pt'))
+
     hf_loaded_model, hf_loaded_tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
         checkpoint_path=str(tmp_path / 'hf-checkpoint.pt'))
 
+    # Make sure we can use the loaded tokenizer and save it again
+    assert hf_loaded_tokenizer is not None
+    _ = hf_loaded_tokenizer('This is some text that should get tokenizer !? @ totallyarealtoken')
+    hf_loaded_tokenizer.save_pretrained(str(tmp_path / 'hf-tokenizer-2'))
+
     check_hf_model_equivalence(hf_loaded_model, tiny_t5_model)
     check_hf_tokenizer_equivalence(hf_loaded_tokenizer, t0_pp_tokenizer)
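
Example usage (a minimal sketch, not part of the patch): loading a tokenizer back out of a Composer
checkpoint with the new tokenizer_save_dir argument, so that the slow/sentencepiece tokenizer files
land in a known persistent directory instead of a temporary one. The checkpoint filename and save
directory below are hypothetical; the state['integrations']['huggingface'] layout mirrors the
structure exercised in the test above.

    import torch

    from composer.models.huggingface import HuggingFaceModel

    # Load the Composer checkpoint and pull out the saved HuggingFace state.
    checkpoint = torch.load('hf-checkpoint.pt')  # hypothetical checkpoint path
    hf_state = checkpoint['state']['integrations']['huggingface']

    # With this fix, the tokenizer files are written to a persistent directory
    # (huggingface keeps referring to the slow tokenizer file on disk) rather
    # than a temporary directory that disappears after loading.
    tokenizer = HuggingFaceModel.load_huggingface_tokenizer_from_saved_state(
        hf_state, tokenizer_save_dir='my-tokenizer-files')  # hypothetical directory
    assert tokenizer is not None
    print(tokenizer('some text to tokenize'))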