Fix huggingface tokenizer loading for slow tokenizers (mosaicml#2483)
dakinggg authored Aug 28, 2023
1 parent f4c56c9 commit a6c0b0f
Showing 2 changed files with 76 additions and 40 deletions.
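For context, the bug being fixed: when a Composer checkpoint containing a slow (e.g. sentencepiece-based) tokenizer was loaded, the tokenizer files were reconstructed in a temporary directory that was deleted after loading, even though HuggingFace keeps reading the slow tokenizer file from disk. A minimal sketch of the round trip this commit repairs, assuming a checkpoint saved with a `HuggingFaceModel`; the path and sample text are illustrative:

```python
from composer.models import HuggingFaceModel

# hf_from_composer_checkpoint returns the wrapped model and, if one was
# saved, the tokenizer ('hf-checkpoint.pt' is an illustrative path).
model, tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
    checkpoint_path='hf-checkpoint.pt')

# With this fix, the slow tokenizer's files persist on disk, so the
# tokenizer remains usable (and re-saveable) after loading.
assert tokenizer is not None
_ = tokenizer('some sample text')
tokenizer.save_pretrained('./resaved-tokenizer')
```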
99 changes: 61 additions & 38 deletions composer/models/huggingface.py
@@ -8,6 +8,9 @@
 import inspect
 import json
 import logging
+import os
+import random
+import string
 import tempfile
 import textwrap
 from pathlib import Path
@@ -140,14 +143,17 @@ def __init__(self,
         self.dummy_forward_called = False
 
     @staticmethod
-    def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
-                                                    trust_remote_code: bool = False
-                                                   ) -> Optional[transformers.PreTrainedTokenizer]:
+    def load_huggingface_tokenizer_from_saved_state(
+            hf_state: Dict[str, Any],
+            trust_remote_code: bool = False,
+            tokenizer_save_dir: Optional[str] = None) -> Optional[transformers.PreTrainedTokenizer]:
         """A helper function that loads a HuggingFace tokenizer from a loaded in hf state.
 
         Args:
             hf_state (Dict[str, Any]): HF state loaded from a Composer checkpoint.
             trust_remote_code (bool, optional): Whether to trust the remote code when loading the tokenizer. Defaults to False.
+            tokenizer_save_dir (Optional[str], optional): If specified, where to save the tokenizer files to locally. If not specified,
+                a folder with a unique suffix will be saved in the current working directory. Defaults to None.
 
         Returns:
             Optional[transformers.PreTrainedTokenizer]: The loaded HuggingFace tokenizer
@@ -161,40 +167,56 @@ def load_huggingface_tokenizer_from_saved_state(hf_state: Dict[str, Any],
         hf_tokenizer = None
         hf_tokenizer_state = hf_state['tokenizer']
         if hf_tokenizer_state != {}:
-            with tempfile.TemporaryDirectory() as _tmp_dir:
-                for filename, saved_content in hf_tokenizer_state.items():
-                    tokenizer_file_path = Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}'
-                    if saved_content['file_extension'] == '.json':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            json.dump(saved_content['content'], _tmp_file)
-                    elif saved_content['file_extension'] == '.txt':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            for line in saved_content['content']:
-                                _tmp_file.write(line)
-                                _tmp_file.write('\n')
-                    elif saved_content['file_extension'] == '.py':
-                        with open(tokenizer_file_path, 'w') as _tmp_file:
-                            _tmp_file.write(saved_content['content'])
-                    elif saved_content['file_extension'] == '.model':
-                        try:
-                            import sentencepiece as spm
-                        except ImportError as e:
-                            raise MissingConditionalImportError(extra_deps_group='sentencepiece',
-                                                                conda_package='sentencepiece') from e
-                        s = spm.SentencePieceProcessor()
-                        s.load_from_serialized_proto(saved_content['content'])
-                        with open(tokenizer_file_path, 'wb') as _tmp_file:
-                            _tmp_file.write(s.serialized_model_proto())
-
-                hf_tokenizer = transformers.AutoTokenizer.from_pretrained(_tmp_dir, trust_remote_code=trust_remote_code)
-
-                # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
-                hf_tokenizer.name_or_path = hf_tokenizer_state['tokenizer_config']['content'].get('name_or_path', '')
-                hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
-
-                # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
-                # for the original tokenizer, so we default it to None
-                hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
+            if tokenizer_save_dir is None:
+                unique_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
+                tokenizer_save_dir = os.path.join(os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
+            os.makedirs(tokenizer_save_dir, exist_ok=True)
+
+            for filename, saved_content in hf_tokenizer_state.items():
+                # This cannot be a temporary directory because huggingface relies on the slow tokenizer file
+                # being persistent on disk
+
+                # For backwards compatibility, check if the filename already has the file extension
+                if filename.endswith(saved_content['file_extension']):
+                    tokenizer_file_name = filename
+                else:
+                    tokenizer_file_name = filename + saved_content['file_extension']
+
+                tokenizer_file_path = Path(tokenizer_save_dir) / tokenizer_file_name
+                if saved_content['file_extension'] == '.json':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        json.dump(saved_content['content'], _f)
+                elif saved_content['file_extension'] == '.txt':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        for line in saved_content['content']:
+                            _f.write(line)
+                            _f.write('\n')
+                elif saved_content['file_extension'] == '.py':
+                    with open(tokenizer_file_path, 'w') as _f:
+                        _f.write(saved_content['content'])
+                elif saved_content['file_extension'] == '.model':
+                    try:
+                        import sentencepiece as spm
+                    except ImportError as e:
+                        raise MissingConditionalImportError(extra_deps_group='sentencepiece',
+                                                            conda_package='sentencepiece') from e
+                    s = spm.SentencePieceProcessor()
+                    s.load_from_serialized_proto(saved_content['content'])
+                    with open(tokenizer_file_path, 'wb') as _f:
+                        _f.write(s.serialized_model_proto())
+
+            hf_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_save_dir,
+                                                                      trust_remote_code=trust_remote_code)
+
+            # we need to set the name_or_path back because otherwise it is the tmp dir we are loading from here
+            # For backwards compatibility we try both the old and new key
+            tokenizer_config_key = 'tokenizer_config.json' if 'tokenizer_config.json' in hf_tokenizer_state else 'tokenizer_config'
+            hf_tokenizer.name_or_path = hf_tokenizer_state[tokenizer_config_key]['content'].get('name_or_path', '')
+            hf_tokenizer.init_kwargs['name_or_path'] = hf_tokenizer.name_or_path
+
+            # for an unknown reason this key is missing when loading the saved tokenizer, but present with a value of None
+            # for the original tokenizer, so we default it to None
+            hf_tokenizer.init_kwargs['tokenizer_file'] = hf_tokenizer.init_kwargs.get('tokenizer_file', None)
         return hf_tokenizer
 
     @staticmethod
@@ -498,7 +520,8 @@ def get_metadata(self):
                     else:
                         raise ValueError(
                             f'Unexpected file ending {tokenizer_file_name} in output of tokenizer.save_pretrained.')
-                    tokenizer_output[tokenizer_file_path.stem] = {
+
+                    tokenizer_output[tokenizer_file_path.name] = {
                         'file_extension': tokenizer_file_extension,
                         'content': tokenizer_file_content
                     }
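The `get_metadata` change above switches the saved key from `tokenizer_file_path.stem` to `tokenizer_file_path.name`, so new checkpoints record each tokenizer file under its full filename; the `endswith` branch added to the loader keeps old stem-keyed checkpoints readable. A small sketch of the difference (using T5's `spiece.model` as an illustrative filename):

```python
from pathlib import Path

# Old key: the stem drops the extension, so the loader had to re-append
# saved_content['file_extension'] when writing the file back out.
assert Path('spiece.model').stem == 'spiece'

# New key: the name already carries the extension.
assert Path('spiece.model').name == 'spiece.model'

# The loader's backwards-compatibility check accepts either key style:
ext = '.model'
for filename in ('spiece', 'spiece.model'):
    name = filename if filename.endswith(ext) else filename + ext
    assert name == 'spiece.model'
```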
17 changes: 15 additions & 2 deletions tests/models/test_hf_model.py
@@ -109,6 +109,7 @@ def test_hf_train_eval_predict(num_classes: int, tiny_bert_config):
     assert predictions[0]['logits'].shape == (batch_size, num_classes)
 
 
+@pytest.mark.filterwarnings('ignore: The variance of predictions')
 def test_hf_train_eval_predict_regression(tiny_deberta_config):
     transformers = pytest.importorskip('transformers')
 
@@ -346,7 +347,7 @@ def test_hf_state_dict_info(tmp_path: Path, pass_in_tokenizer: bool, modify_tokenizer
     with tempfile.TemporaryDirectory() as _tmp_dir:
         if dist.get_local_rank() == 0:
             for filename, saved_content in hf_tokenizer_state.items():
-                with open(Path(_tmp_dir) / f'{filename}{saved_content["file_extension"]}', 'w') as _tmp_file:
+                with open(Path(_tmp_dir) / filename, 'w') as _tmp_file:
                     if saved_content['file_extension'] == '.json':
                         json.dump(saved_content['content'], _tmp_file)
                     elif saved_content['file_extension'] == '.txt':
@@ -543,7 +544,8 @@ def test_hf_loading_load_save_paths(checkpoint_upload_path: Optional[str], local


 @pytest.mark.parametrize('modify_tokenizer', [False, True])
-def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny_t5_model):
+@pytest.mark.parametrize('save_fast', [True, False])
+def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, save_fast: bool, tiny_t5_model):
     transformers = pytest.importorskip('transformers')
 
     t0_pp_tokenizer = transformers.AutoTokenizer.from_pretrained('bigscience/T0pp')
@@ -558,9 +560,20 @@ def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny_t5_model):
     trainer = get_lm_trainer(tiny_t5_model, t0_pp_tokenizer, str(tmp_path), is_conditional_generation=True)
     trainer.save_checkpoint(str(tmp_path / 'hf-checkpoint.pt'))
 
+    if not save_fast:
+        sd = torch.load(str(tmp_path / 'hf-checkpoint.pt'))
+        # remove the fast tokenizer file from the checkpoint
+        del sd['state']['integrations']['huggingface']['tokenizer']['tokenizer.json']
+        torch.save(sd, str(tmp_path / 'hf-checkpoint.pt'))
+
     hf_loaded_model, hf_loaded_tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
         checkpoint_path=str(tmp_path / 'hf-checkpoint.pt'))
 
+    # Make sure we can use the loaded tokenizer and save it again
+    assert hf_loaded_tokenizer is not None
+    _ = hf_loaded_tokenizer('This is some text that should get tokenizer !? @ totallyarealtoken')
+    hf_loaded_tokenizer.save_pretrained(str(tmp_path / 'hf-tokenizer-2'))
+
     check_hf_model_equivalence(hf_loaded_model, tiny_t5_model)
     check_hf_tokenizer_equivalence(hf_loaded_tokenizer, t0_pp_tokenizer)
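For callers that want the tokenizer files in a known location rather than an auto-generated `tokenizer-save-dir-*` folder, the new `tokenizer_save_dir` argument can be passed directly. A hedged sketch, assuming an on-disk Composer checkpoint laid out as in the test above (paths are illustrative):

```python
import torch

from composer.models import HuggingFaceModel

# Pull the HuggingFace integration state out of the raw checkpoint,
# mirroring the structure the test above manipulates.
state_dict = torch.load('hf-checkpoint.pt', map_location='cpu')
hf_state = state_dict['state']['integrations']['huggingface']

# The save dir must outlive the tokenizer: slow tokenizers keep reading
# their files from disk after from_pretrained returns.
tokenizer = HuggingFaceModel.load_huggingface_tokenizer_from_saved_state(
    hf_state, tokenizer_save_dir='./my-tokenizer-files')
```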
