Remove logging fix to separate PR. Relocate conversion script to avoid logging circular import issue
tjruwase committed Oct 17, 2023
1 parent 3dc989e commit b13006b
Showing 18 changed files with 28 additions and 25 deletions.
@@ -38,10 +38,8 @@

def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder', type=str, help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder', type=str, help='Output DeepSpeed checkpoint folder')
parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree')
parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree')
parser.add_argument('--input_folder', type=str, required=True, help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder', type=str, required=True, help='Output DeepSpeed checkpoint folder')
parser.add_argument('--num_extract_workers',
default=4,
type=int,
@@ -56,6 +54,10 @@ def parse_arguments():
parser.add_argument('--for_release',
action='store_true',
help='Convert for release purpose, reset some (progress) counters.')
parser.add_argument('--keep_temp_folder',
action='store_true',
help='Preserve temporary folder of intermediate checkpoint slice files. Useful for debugging.'
)
args = parser.parse_args()
print(f'args = {args}')
return args
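
For readers following the argument changes above: the two checkpoint folders are now mandatory and the fixed --target_tp/--target_pp options are gone, while the new --keep_temp_folder switch preserves intermediate slice files for debugging. A minimal sketch of the resulting parser behavior, assuming only the arguments shown in this hunk (the folder paths below are made up for illustration):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_folder', type=str, required=True, help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder', type=str, required=True, help='Output DeepSpeed checkpoint folder')
parser.add_argument('--keep_temp_folder', action='store_true',
                    help='Preserve temporary folder of intermediate checkpoint slice files.')

# Omitting a required folder now exits with a usage error instead of passing None downstream.
try:
    parser.parse_args([])
except SystemExit:
    print('missing --input_folder/--output_folder is rejected')

# Typical invocation (illustrative paths only).
args = parser.parse_args(['--input_folder', '/tmp/ds_ckpt', '--output_folder', '/tmp/universal_ckpt', '--keep_temp_folder'])
print(args.keep_temp_folder)  # True
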
@@ -165,7 +167,6 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):

universal_checkpoint_info = ds_checkpoint.get_checkpoint_info(UNIVERSAL_CHECKPOINT_INFO)
parameters_to_average = universal_checkpoint_info.get(PARAMETER_TO_AVERAGE_PATTERNS, [])
print(f'{parameters_to_average=}')
parameters_with_row_parallelism = universal_checkpoint_info.get(PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, [])
vocabulary_parameters = universal_checkpoint_info.get(VOCABULARY_PARAMETER_PATTERNS, [])
for state in ("fp32", "exp_avg", "exp_avg_sq"):
@@ -177,10 +178,10 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):
ckpt_dict = {}
if any(re.match(pattern, name) for pattern in parameters_to_average):
param = sum(slices) / len(slices)
print(f'merge {name} using average')
# print(f'merge {name} using average')
else:
cat_dim = 1 if any(re.match(pattern, name) for pattern in parameters_with_row_parallelism) else 0
print(f"merge {name} with CAT DIM: {cat_dim}")
# print(f"merge {name} with CAT DIM: {cat_dim}")
param = torch.cat(slices, dim=cat_dim)
ckpt_dict[CAT_DIM] = cat_dim
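
The merge rule in this hunk can be illustrated in isolation: tensor-parallel slices of replicated parameters are averaged, while partitioned parameters are concatenated back together, along dim 1 for row-parallel weights and dim 0 otherwise. A standalone sketch with toy tensors, assuming PyTorch and regex pattern lists like those read from the universal checkpoint info above:

import re
import torch

def merge_slices(name, slices, avg_patterns, row_parallel_patterns):
    # Replicated parameters are (near-)identical on every TP rank, so averaging is safe.
    if any(re.match(p, name) for p in avg_patterns):
        return sum(slices) / len(slices)
    # Partitioned parameters are stitched back together: row-parallel weights were
    # split along dim 1, everything else along dim 0.
    cat_dim = 1 if any(re.match(p, name) for p in row_parallel_patterns) else 0
    return torch.cat(slices, dim=cat_dim)

# Toy example: two TP slices of a column-parallel weight rejoin along dim 0.
slices = [torch.ones(2, 4), torch.zeros(2, 4)]
merged = merge_slices('dense.weight', slices, avg_patterns=[], row_parallel_patterns=[])
print(merged.shape)  # torch.Size([4, 4])
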

@@ -252,7 +253,7 @@ def main():
args = parse_arguments()
print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}')

ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) #, 1, 2) # args.target_tp, args.target_pp)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)
_check_for_required_state(ds_checkpoint)

iteration = ds_checkpoint.get_iteration()
@@ -278,7 +279,8 @@ def main():
print('*** 3. Saving common optimizer states')
_save_optimizer_state(args, ds_checkpoint)

# shutil.rmtree(temp_dir, ignore_errors=True)
if not args.keep_temp_folder:
shutil.rmtree(temp_dir, ignore_errors=True)

# Copy mp* files into output folder
for f in glob.glob(os.path.join(args.input_folder, 'mp*')):
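
The cleanup that --keep_temp_folder now controls can be sketched on its own; this is a toy example with a throwaway directory, not the script's actual temp_dir handling:

import os
import shutil
import tempfile

keep_temp_folder = False  # stands in for args.keep_temp_folder

temp_dir = tempfile.mkdtemp(prefix='ckpt_slices_')  # illustrative name only
with open(os.path.join(temp_dir, 'slice.pt'), 'wb') as f:
    f.write(b'\x00')  # pretend this is an extracted checkpoint slice

# Mirrors the new behavior: intermediate slices are removed unless the user
# asked to keep them for debugging.
if not keep_temp_folder:
    shutil.rmtree(temp_dir, ignore_errors=True)

print(os.path.isdir(temp_dir))  # False once cleanup has run
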
2 changes: 1 addition & 1 deletion deepspeed/inference/engine.py
@@ -7,7 +7,7 @@
import time
import os
from deepspeed import comm as dist
from deepspeed.utils import log_dist
from deepspeed.utils.logging import log_dist

from torch.nn.modules import Module
from packaging import version as pkg_version
2 changes: 1 addition & 1 deletion deepspeed/inference/quantization/quantization.py
@@ -10,7 +10,7 @@
from deepspeed.inference.quantization import layers
from .layers import QUANTIZATION_LAYER_MAPPINGS
from .utils import get_AsyncPartitionedParameterSwapper, recursive_setattr
from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from collections import deque
from transformers.utils.generic import ContextManagers
from .quantization_context import QuantizationContext
@@ -6,7 +6,7 @@
import torch
import torch.nn as nn
from deepspeed import comm as dist
from deepspeed.utils import log_dist
from deepspeed.utils.logging import log_dist

from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP
from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention
2 changes: 1 addition & 1 deletion deepspeed/module_inject/fusedqkv_utils.py
@@ -3,7 +3,7 @@

# DeepSpeed Team
import torch
from deepspeed.utils import warning_once
from deepspeed.utils.logging import warning_once
import re


2 changes: 1 addition & 1 deletion deepspeed/ops/adagrad/cpu_adagrad.py
@@ -5,7 +5,7 @@

import torch
from deepspeed.ops.op_builder import CPUAdagradBuilder
from deepspeed.utils import should_log_le
from deepspeed.utils.logging import should_log_le


class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
3 changes: 2 additions & 1 deletion deepspeed/ops/adam/cpu_adam.py
@@ -5,7 +5,8 @@

import torch
from cpuinfo import get_cpu_info
from deepspeed.utils import logger, should_log_le
from deepspeed.utils import logger
from deepspeed.utils.logging import should_log_le
from deepspeed.ops.op_builder import CPUAdamBuilder


2 changes: 1 addition & 1 deletion deepspeed/ops/lion/cpu_lion.py
@@ -6,7 +6,7 @@
import torch
from cpuinfo import get_cpu_info
from deepspeed.utils import logger
from deepspeed.utils import should_log_le
from deepspeed.utils.logging import should_log_le
from deepspeed.ops.op_builder import CPULionBuilder


2 changes: 1 addition & 1 deletion deepspeed/ops/transformer/inference/diffusers_attention.py
@@ -8,7 +8,7 @@
from torch.autograd import Function
import torch.nn as nn
from packaging import version as pkg_version
from deepspeed.utils import log_dist
from deepspeed.utils.logging import log_dist
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder

2 changes: 1 addition & 1 deletion deepspeed/runtime/engine.py
@@ -97,7 +97,7 @@
from ..git_version_info import version

from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler
from deepspeed.utils import print_json_dist, print_configuration
from deepspeed.utils.logging import print_json_dist, print_configuration

from deepspeed.accelerator import get_accelerator

2 changes: 1 addition & 1 deletion deepspeed/runtime/swap_tensor/async_swapper.py
@@ -8,7 +8,7 @@
import torch

from deepspeed import comm as dist
from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer
from deepspeed.accelerator import get_accelerator

2 changes: 1 addition & 1 deletion deepspeed/runtime/swap_tensor/optimizer_utils.py
@@ -10,7 +10,7 @@
import torch

from deepspeed import comm as dist
from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from deepspeed.runtime.swap_tensor.constants import *
from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \
MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers
@@ -8,7 +8,7 @@

import torch

from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from deepspeed.ops.op_builder import AsyncIOBuilder
from deepspeed import comm as dist

2 changes: 1 addition & 1 deletion deepspeed/runtime/swap_tensor/utils.py
@@ -7,7 +7,7 @@
"""

import torch
from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from deepspeed.accelerator import get_accelerator

from deepspeed import comm as dist
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/partitioned_param_coordinator.py
@@ -9,7 +9,7 @@
from typing import Deque, Set

from deepspeed import comm as dist
from deepspeed.utils import logger
from deepspeed.utils.logging import logger
from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
from deepspeed.runtime.zero.partition_parameters import *
from deepspeed.runtime.zero.partitioned_param_profiler import PartitionedParameterProfiler
2 changes: 1 addition & 1 deletion deepspeed/utils/__init__.py
@@ -3,7 +3,7 @@

# DeepSpeed Team

from .ds_logging import logger, log_dist, print_json_dist, print_configuration, warning_once, should_log_le
from .logging import logger, log_dist
from .comms_logging import get_caller_func
#from .distributed import init_distributed
from .init_on_device import OnDevice
File renamed without changes.
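
The deepspeed/utils/__init__.py change above, together with the file rename noted just above (the __init__ now imports from .logging rather than .ds_logging), narrows what the package re-exports at import time, which is the heart of the circular-import avoidance: only logger and log_dist stay at package level, and the remaining helpers are imported from the logging submodule directly, so modules pulled in during package initialization never have to import the half-initialized deepspeed.utils package. A minimal sketch of the import style after this commit, assuming a DeepSpeed installation that includes it:

# Still available from the package itself.
from deepspeed.utils import logger, log_dist

# Helpers that are no longer re-exported come straight from the submodule,
# bypassing deepspeed/utils/__init__.py while it may still be initializing.
from deepspeed.utils.logging import should_log_le, warning_once
from deepspeed.utils.logging import print_json_dist, print_configuration

if should_log_le("info"):
    logger.info("logging helpers resolved without a circular import")
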
2 changes: 1 addition & 1 deletion deepspeed/utils/timer.py
@@ -5,7 +5,7 @@

import time
from numpy import mean
from deepspeed.utils import log_dist
from deepspeed.utils.logging import log_dist
from deepspeed.accelerator import get_accelerator

FORWARD_MICRO_TIMER = 'fwd_microstep'
