From 16cc33c700e5eff3639eaf5ecbf9fb6b463b1cd9 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 24 Jun 2024 15:54:48 +0800 Subject: [PATCH 1/4] add push to hub tracker --- swift/hub/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/hub/api.py b/swift/hub/api.py index 0d566f50b..d625098a3 100644 --- a/swift/hub/api.py +++ b/swift/hub/api.py @@ -9,6 +9,7 @@ import re import shutil import tempfile +import time import uuid from http import HTTPStatus from http.cookiejar import CookieJar @@ -142,7 +143,7 @@ def create_model(self, 'Visibility': visibility, # server check 'License': license, 'OriginalModelId': original_model_id, - 'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''), + 'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID') or f'swift-{time.time()}', } r = self.session.post( path, json=body, cookies=cookies, headers=self.headers) From 6688285c62f62e6e264b1030232c4a9364ee2586 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 25 Jun 2024 14:16:08 +0800 Subject: [PATCH 2/4] wip --- swift/trainers/mixin.py | 91 +++++------------------------------------ swift/utils/hub.py | 11 ++++- 2 files changed, 20 insertions(+), 82 deletions(-) diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index e87e4923f..f767bbe19 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -4,6 +4,7 @@ import re import shutil import time +from distutils.util import strtobool from pathlib import Path from types import MethodType from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -23,16 +24,16 @@ from transformers.trainer import (ADAPTER_CONFIG_NAME, ADAPTER_SAFE_WEIGHTS_NAME, ADAPTER_WEIGHTS_NAME, CONFIG_NAME, PREFIX_CHECKPOINT_DIR, SAFE_WEIGHTS_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME, WEIGHTS_NAME, IntervalStrategy, Trainer, TrainerCallback, is_peft_available) -from transformers.trainer_utils import EvalPrediction +from transformers.trainer_utils import EvalPrediction, HubStrategy from transformers.training_args import TrainingArguments -from transformers.utils import is_sagemaker_mp_enabled, is_torch_npu_available +from transformers.utils import is_sagemaker_mp_enabled, is_torch_npu_available, PushInProgress from swift.hub import Repository from swift.hub.check_model import check_local_model_is_latest from swift.torchacc_utils import (save_ta_ddp_checkpoint, save_ta_fsdp_checkpoint, ta_load_optimizer_and_scheduler, ta_save_optimizer_and_scheduler, ta_trim_graph) from swift.tuners import SwiftModel -from swift.utils import check_json_format, create_ms_repo, get_logger, use_torchacc +from swift.utils import check_json_format, create_ms_repo, get_logger, use_torchacc, push_to_ms_hub from swift.utils.constants import Invoke from .optimizers.galore import create_optimizer_and_scheduler from .utils import can_return_loss, find_labels, get_function, is_instance_of_ms_model @@ -53,6 +54,13 @@ def _push_to_hub(self: Repository, commit_message: str = 'Commit files to Models class PushToMsHubMixin: repo: Repository + _hub_type = 'hf' if strtobool(os.environ.get('USE_HF', 'False')) else 'ms' + + if _hub_type == 'ms': + import transformers.trainer + transformers.trainer.create_repo = create_ms_repo + transformers.trainer.upload_folder = push_to_ms_hub + def _add_patterns_to_file(self, file_name: str, patterns: List[str], commit_message: Optional[str] = None) -> None: # Make sure we only do this on the main process if not self.is_world_process_zero(): @@ -100,10 +108,6 @@ def _add_patterns_to_gitattributes(self, patterns: List[str], commit_message: Op commit_message = f'Add `{patterns[0]}` patterns to {file_name}' self._add_patterns_to_file(file_name, new_patterns, commit_message) - def init_hf_repo(self) -> None: - """init ms repo. Compatible with transformers>=4.34""" - self.init_git_repo(at_init=True) - def init_git_repo(self, at_init: bool = False) -> None: if not self.is_world_process_zero(): return @@ -131,79 +135,6 @@ def init_git_repo(self, at_init: bool = False) -> None: self.push_in_progress = None - def push_to_hub(self, commit_message: str = 'End of training', **kwargs) -> None: - # user calls manually `push_to_hub` with `self.args.push_to_hub = False` - create_model_card = kwargs.pop('create_model_card', None) - if not hasattr(self, 'repo'): - self.init_git_repo() - self.save_model(_internal_call=True) - - if not self.is_world_process_zero(): - return - - self.repo.push_to_hub(commit_message, **kwargs) - # push separately the model card to be independent from the rest of the model - readme_path = os.path.join(self.args.output_dir, 'README.md') - if create_model_card is None: - create_model_card = not os.path.exists(readme_path) - if create_model_card and self.args.should_save: - model_name = kwargs.pop('model_name', None) - if model_name is None and self.args.should_save: - if self.args.hub_model_id is not None: - model_name = self.args.hub_model_id.split('/')[-1] - else: - model_name = os.path.basename(self.args.output_dir) - self.create_model_card(model_name=model_name, **kwargs) - self.repo.push_to_hub('update model card README.md', **kwargs) - - def _push_from_checkpoint(self, checkpoint_folder: str) -> None: - """Compatible with transformers>=4.32""" - # Only push from one node. - if not self.is_world_process_zero() or self.args.push_hub_strategy == 'end': - return - output_dir = self.args.output_dir - # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder - modeling_files = [CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME] - if is_peft_available(): - modeling_files.extend([ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME]) - for modeling_file in modeling_files: - if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)): - shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file)) - # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - # Same for the training arguments - torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) - - try: - if self.args.push_hub_strategy == 'checkpoint': - # Temporarily move the checkpoint just saved for the push - tmp_checkpoint = os.path.join(output_dir, 'last-checkpoint') - # We have to remove the "last-checkpoint" dir if it exists, otherwise the checkpoint is moved as a - # subfolder. - if os.path.isdir(tmp_checkpoint): - shutil.rmtree(tmp_checkpoint) - shutil.move(checkpoint_folder, tmp_checkpoint) - - if self.args.save_strategy == IntervalStrategy.STEPS: - commit_message = f'Training in progress, step {self.state.global_step}' - else: - commit_message = f'Training in progress, epoch {int(self.state.epoch)}' - if self.args.push_hub_strategy == 'push_best': - folder, checkpoint_name = os.path.split(checkpoint_folder) - checkpoint_name = checkpoint_name.replace('tmp-checkpoint-', 'checkpoint-') - last_model_checkpoint = os.path.join(folder, checkpoint_name) - if last_model_checkpoint == self.state.best_model_checkpoint: - self.repo.push_to_hub(commit_message=commit_message, blocking=False, auto_lfs_prune=True) - else: - self.repo.push_to_hub(commit_message=commit_message, blocking=False, auto_lfs_prune=True) - except Exception as e: - logger.error(f'Error when pushing to hub: {e}') - finally: - if self.args.push_hub_strategy == 'checkpoint': - # Move back the checkpoint to its place - shutil.move(tmp_checkpoint, checkpoint_folder) - class SwiftMixin: diff --git a/swift/utils/hub.py b/swift/utils/hub.py index 20aa15df1..353db2824 100644 --- a/swift/utils/hub.py +++ b/swift/utils/hub.py @@ -15,8 +15,15 @@ logger = get_logger() -def create_ms_repo(hub_model_id: str, hub_token: Optional[str] = None, hub_private_repo: bool = False) -> str: - assert hub_model_id is not None, 'Please enter a valid hub_model_id' +def create_ms_repo( + repo_id: str, + *, + token: Optional[str] = None, + private: bool = False, + repo_type: Optional[str] = None, + exist_ok: bool = False, + **kwargs) -> str: + assert repo_id is not None, 'Please enter a valid hub_model_id' api = HubApi() if hub_token is None: From f913505c0455492339e29e4f300fb964f010fcdd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 28 Jun 2024 16:31:28 +0800 Subject: [PATCH 3/4] wip --- swift/trainers/mixin.py | 7 ++++- swift/utils/hub.py | 64 ++++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index d44462214..c6396d2d9 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -61,6 +61,12 @@ class PushToMsHubMixin: transformers.trainer.create_repo = create_ms_repo transformers.trainer.upload_folder = push_to_ms_hub + def init_hf_repo(self) -> None: + if self._hub_type == 'hf': + return super().init_hf_repo() + else: + self.init_git_repo(at_init=True) + def _add_patterns_to_file(self, file_name: str, patterns: List[str], commit_message: Optional[str] = None) -> None: # Make sure we only do this on the main process if not self.is_world_process_zero(): @@ -132,7 +138,6 @@ def init_git_repo(self, at_init: bool = False) -> None: if os.environ.get('SM_TRAINING_ENV'): self._add_patterns_to_gitignore(['*.sagemaker-uploading', '*.sagemaker-uploaded'], 'Add `*.sagemaker` patterns to .gitignore') - self.push_in_progress = None diff --git a/swift/utils/hub.py b/swift/utils/hub.py index 353db2824..f7b880df9 100644 --- a/swift/utils/hub.py +++ b/swift/utils/hub.py @@ -3,7 +3,8 @@ import subprocess import tempfile import time -from typing import Optional +from pathlib import Path +from typing import Optional, Union, List from requests.exceptions import HTTPError @@ -20,65 +21,68 @@ def create_ms_repo( *, token: Optional[str] = None, private: bool = False, - repo_type: Optional[str] = None, - exist_ok: bool = False, **kwargs) -> str: assert repo_id is not None, 'Please enter a valid hub_model_id' api = HubApi() - if hub_token is None: + if token is None: hub_token = os.environ.get('MODELSCOPE_API_TOKEN') - if hub_token is not None: - api.login(hub_token) - visibility = ModelVisibility.PRIVATE if hub_private_repo else ModelVisibility.PUBLIC + if token is not None: + api.login(token) + visibility = ModelVisibility.PRIVATE if private else ModelVisibility.PUBLIC - if '/' not in hub_model_id: + if '/' not in repo_id: user_name = ModelScopeConfig.get_user_info()[0] assert isinstance(user_name, str) - hub_model_id = f'{user_name}/{hub_model_id}' - logger.info(f"'/' not in hub_model_id, setting hub_model_id: {hub_model_id}") + repo_id = f'{user_name}/{repo_id}' + logger.info(f"'/' not in hub_model_id, setting hub_model_id: {repo_id}") try: - api.create_model(hub_model_id, visibility) + api.create_model(repo_id, visibility) except HTTPError: # The remote repository has been created pass - return hub_model_id + return repo_id -def push_to_ms_hub(ckpt_dir: str, - hub_model_id: str, - hub_token: Optional[str] = None, - hub_private_repo: bool = False, - commit_message: str = 'update files'): - logger.info(f'Starting push to hub. ckpt_dir: {ckpt_dir}.') +def push_to_ms_hub(self, + *, + repo_id: str, + folder_path: Union[str, Path], + path_in_repo: Optional[str] = None, + commit_message: Optional[str] = None, + token: Union[str, bool, None] = None): + logger.info(f'Starting push to hub. ckpt_dir: {folder_path}.') tmp_file_name = tempfile.TemporaryDirectory().name subprocess_run(['git', 'lfs', 'env'], stdout=subprocess.PIPE) # check git-lfs install - hub_model_id = create_ms_repo(hub_model_id, hub_token, hub_private_repo) + path_in_repo = path_in_repo or '' + if not folder_path.endswith(path_in_repo): + folder_path = os.path.join(folder_path, path_in_repo) + git_token = ModelScopeConfig.get_token() - ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{hub_model_id}.git' - subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, tmp_file_name], env={'GIT_LFS_SKIP_SMUDGE': '1'}) - tmp_dir = os.path.join(ckpt_dir, tmp_file_name) + ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{repo_id}.git' + subprocess_run(['git', '-C', folder_path, 'clone', ms_url, tmp_file_name], env={'GIT_LFS_SKIP_SMUDGE': '1'}) + tmp_dir = os.path.join(folder_path, tmp_file_name) subprocess_run(['git', '-C', tmp_dir, 'lfs', 'pull']) logger.info('Git clone the repo successfully.') # mv .git - dst_git_path = os.path.join(ckpt_dir, '.git') + dst_git_path = os.path.join(folder_path, '.git') if os.path.exists(dst_git_path): shutil.rmtree(dst_git_path) shutil.copytree(os.path.join(tmp_dir, '.git'), dst_git_path) - shutil.copy(os.path.join(tmp_dir, '.gitattributes'), os.path.join(ckpt_dir, '.gitattributes')) + shutil.copy(os.path.join(tmp_dir, '.gitattributes'), os.path.join(folder_path, '.gitattributes')) shutil.rmtree(tmp_dir) # add commit push - subprocess_run(['git', '-C', ckpt_dir, 'lfs', 'install']) + subprocess_run(['git', '-C', folder_path, 'lfs', 'install']) time.sleep(0.5) logger.info('Start `git add .`') - subprocess_run(['git', '-C', ckpt_dir, 'add', '.']) - if is_repo_clean(ckpt_dir): + subprocess_run(['git', '-C', folder_path, 'add', '.']) + if is_repo_clean(folder_path): logger.info('Repo currently clean. Ignoring commit and push_to_hub') else: - subprocess_run(['git', '-C', ckpt_dir, 'commit', '-m', commit_message]) - subprocess_run(['git', '-C', ckpt_dir, 'push']) - url = f'https://www.modelscope.cn/models/{hub_model_id}/summary' + subprocess_run(['git', '-C', folder_path, 'commit', '-m', commit_message]) + subprocess_run(['git', '-C', folder_path, 'push']) + url = f'https://www.modelscope.cn/models/{repo_id}/summary' logger.info(f'Push to Modelscope successful. url: `{url}`.') From ec10d18db398f8172af88b2efb2bb5dc07f12a95 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 28 Jun 2024 16:56:17 +0800 Subject: [PATCH 4/4] wip --- swift/trainers/mixin.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index c6396d2d9..17968703a 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -41,16 +41,6 @@ logger = get_logger() -def _push_to_hub(self: Repository, commit_message: str = 'Commit files to Modelscope Hub', **kwargs): - blocking = kwargs.get('blocking', True) - self.push(commit_message) - if not blocking: - # Compatible with transformers - return None, None - else: - return None - - class PushToMsHubMixin: repo: Repository @@ -114,6 +104,16 @@ def _add_patterns_to_gitattributes(self, patterns: List[str], commit_message: Op commit_message = f'Add `{patterns[0]}` patterns to {file_name}' self._add_patterns_to_file(file_name, new_patterns, commit_message) + @staticmethod + def _push_to_hub(repo: Repository, commit_message: str = 'Commit files to Modelscope Hub', **kwargs): + blocking = kwargs.get('blocking', True) + repo.push(commit_message) + if not blocking: + # Compatible with transformers + return None, None + else: + return None + def init_git_repo(self, at_init: bool = False) -> None: if not self.is_world_process_zero(): return @@ -124,7 +124,7 @@ def init_git_repo(self, at_init: bool = False) -> None: self.args.hub_model_id = create_ms_repo(self.args.hub_model_id, self.args.hub_token, self.args.hub_private_repo) self.repo = Repository(self.args.output_dir, self.args.hub_model_id) self._add_patterns_to_gitattributes(['*.safetensors', '*.bin', '*.pt']) - self.repo.push_to_hub = MethodType(_push_to_hub, self.repo) + self.repo.push_to_hub = MethodType(self._push_to_hub, self.repo) self.repo.local_dir = self.repo.model_dir # hf compatibility # By default, ignore the checkpoint folders