From 5132e44bc3a4afa90ac62e7e199645d8216044a4 Mon Sep 17 00:00:00 2001 From: Max Kovalenko Date: Thu, 4 Apr 2024 20:42:40 +0300 Subject: [PATCH 1/4] Add throughput timer configuration The new 'timers' section describes configuration for different timers. Specifically, in the "throughput" section, it is possible to disable the throughput timer (enabled by default). This makes it possible to avoid the performance degradation whenever the throughput measurement is not needed, for example in a production environment. No device synchronize() is invoked when "synchronized" is set to False (default is True). This makes it possible to produce approximate throughput measurements with minimal performance penalty. --- deepspeed/runtime/config.py | 4 ++++ deepspeed/runtime/engine.py | 9 ++++----- deepspeed/runtime/pipe/engine.py | 3 ++- deepspeed/utils/config.py | 28 +++++++++++++++++++++++++++ deepspeed/utils/constants.py | 33 ++++++++++++++++++++++++++++++++ deepspeed/utils/timer.py | 20 +++++++++---------- 6 files changed, 80 insertions(+), 17 deletions(-) create mode 100644 deepspeed/utils/config.py create mode 100644 deepspeed/utils/constants.py diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 19b169086be1..d10de090fe78 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -66,6 +66,8 @@ from .data_pipeline.config import get_data_efficiency_enabled, get_data_efficiency_config, get_curriculum_enabled_legacy, get_curriculum_params_legacy from .data_pipeline.constants import * +from ..utils.config import DeepSpeedThroughputTimerConfig + TENSOR_CORE_ALIGN_SIZE = 8 ADAGRAD_OPTIMIZER = 'adagrad' @@ -911,6 +913,8 @@ def _initialize_params(self, param_dict): self.compile_config = get_compile_config(param_dict) + self.timers_config = DeepSpeedThroughputTimerConfig(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 3ad37baeedcb..5797ceeac7be 100644 --- 
a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -271,11 +271,10 @@ def __init__(self, # Configure wall clock timers self.timers = SynchronizedWallClockTimer() # Throughput timer - self.tput_timer = ThroughputTimer( - batch_size=self.train_batch_size(), - steps_per_output=self.steps_per_print(), - monitor_memory=False, - ) + self.tput_timer = ThroughputTimer(self._config.timers_config, + batch_size=self.train_batch_size(), + steps_per_output=self.steps_per_print(), + monitor_memory=False) log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", ranks=[0]) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index ef1c98a95c7b..50211ac74a2f 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -118,7 +118,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self._force_grad_boundary = False - self.batch_timer = ThroughputTimer(batch_size=self.train_batch_size(), + self.batch_timer = ThroughputTimer(self._config.timers_config, + batch_size=self.train_batch_size(), logging_fn=self.tput_log, monitor_memory=False, steps_per_output=self.steps_per_print()) diff --git a/deepspeed/utils/config.py b/deepspeed/utils/config.py new file mode 100644 index 000000000000..e5c6e6010160 --- /dev/null +++ b/deepspeed/utils/config.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject +from deepspeed.utils.constants import * + + +class DeepSpeedThroughputTimerConfig(DeepSpeedConfigObject): + + def __init__(self, param_dict): + super(DeepSpeedThroughputTimerConfig, self).__init__() + + self.enabled = None + self.synchronized = None + + timers_dict = {} + if param_dict and TIMERS in param_dict: + if TIMERS_THROUGHPUT in param_dict[TIMERS]: + timers_dict = param_dict[TIMERS][TIMERS_THROUGHPUT] + + self._initialize(timers_dict) + + def _initialize(self, param_dict): + self.enabled = get_scalar_param(param_dict, TIMERS_THROUGHPUT_ENABLED, TIMERS_THROUGHPUT_ENABLED_DEFAULT) + self.synchronized = get_scalar_param(param_dict, TIMERS_THROUGHPUT_SYNCHRONIZED, + TIMERS_THROUGHPUT_SYNCHRONIZED_DEFAULT) diff --git a/deepspeed/utils/constants.py b/deepspeed/utils/constants.py new file mode 100644 index 000000000000..2f238e5aadfb --- /dev/null +++ b/deepspeed/utils/constants.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +######################################### +# Timers +######################################### +''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible +with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that +can support a large number of GPUs based on the user specified parameters +''' +TIMERS_FORMAT = ''' +Timers should be enabled as: +"timers": { + "throughput": { + "enabled": true, + "synchronized": true + } +} +''' + +TIMERS = "timers" +TIMERS_THROUGHPUT = "throughput" + +TIMERS_THROUGHPUT_ENABLED = "enabled" +TIMERS_THROUGHPUT_ENABLED_DEFAULT = True + +# Synchronizing a device is required to produce the most accurate timer measurements. +# However, this comes at the expense of performance degradation. 
The CPU timer provides +# sufficient accuracy in many cases. +TIMERS_THROUGHPUT_SYNCHRONIZED = "synchronized" +TIMERS_THROUGHPUT_SYNCHRONIZED_DEFAULT = True diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 11ef54fe4665..dd78b207cc37 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -197,15 +197,9 @@ def get_mean(self, names, normalizer=1.0, reset=True): class ThroughputTimer: - def __init__( - self, - batch_size, - start_step=2, - steps_per_output=50, - monitor_memory=False, - logging_fn=None, - ): + def __init__(self, config, batch_size, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None): from deepspeed.utils import logger + self.config = config self.start_time = 0 self.end_time = 0 self.started = False @@ -234,14 +228,17 @@ def _init_timer(self): self.initialized = True def start(self): + if not self.config.enabled: + return self._init_timer() self.started = True if self.global_step_count >= self.start_step: - get_accelerator().synchronize() + if self.config.synchronized: + get_accelerator().synchronize() self.start_time = time.time() def stop(self, global_step=False, report_speed=True): - if not self.started: + if not self.config.enabled or not self.started: return self.started = False self.micro_step_count += 1 @@ -249,7 +246,8 @@ def stop(self, global_step=False, report_speed=True): self.global_step_count += 1 if self.start_time > 0: - get_accelerator().synchronize() + if self.config.synchronized: + get_accelerator().synchronize() self.end_time = time.time() duration = self.end_time - self.start_time self.total_elapsed_time += duration From 69bd7d2de9491e816efa1693dcfb33ef8ada1371 Mon Sep 17 00:00:00 2001 From: Max Kovalenko Date: Thu, 18 Apr 2024 15:22:57 +0300 Subject: [PATCH 2/4] Use DeepSpeedConfigModel for timers configuration --- deepspeed/runtime/config.py | 4 +-- deepspeed/utils/config.py | 64 ++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 25 deletions(-) diff 
--git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index d10de090fe78..04b122963a38 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -66,7 +66,7 @@ from .data_pipeline.config import get_data_efficiency_enabled, get_data_efficiency_config, get_curriculum_enabled_legacy, get_curriculum_params_legacy from .data_pipeline.constants import * -from ..utils.config import DeepSpeedThroughputTimerConfig +from ..utils.config import get_timers_config TENSOR_CORE_ALIGN_SIZE = 8 @@ -913,7 +913,7 @@ def _initialize_params(self, param_dict): self.compile_config = get_compile_config(param_dict) - self.timers_config = DeepSpeedThroughputTimerConfig(param_dict) + self.timers_config = get_timers_config(param_dict) def _batch_assertion(self): diff --git a/deepspeed/utils/config.py b/deepspeed/utils/config.py index e5c6e6010160..31c39bae5204 100644 --- a/deepspeed/utils/config.py +++ b/deepspeed/utils/config.py @@ -3,26 +3,44 @@ # DeepSpeed Team -from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject -from deepspeed.utils.constants import * - - -class DeepSpeedThroughputTimerConfig(DeepSpeedConfigObject): - - def __init__(self, param_dict): - super(DeepSpeedThroughputTimerConfig, self).__init__() - - self.enabled = None - self.synchronized = None - - timers_dict = {} - if param_dict and TIMERS in param_dict: - if TIMERS_THROUGHPUT in param_dict[TIMERS]: - timers_dict = param_dict[TIMERS][TIMERS_THROUGHPUT] - - self._initialize(timers_dict) - - def _initialize(self, param_dict): - self.enabled = get_scalar_param(param_dict, TIMERS_THROUGHPUT_ENABLED, TIMERS_THROUGHPUT_ENABLED_DEFAULT) - self.synchronized = get_scalar_param(param_dict, TIMERS_THROUGHPUT_SYNCHRONIZED, - TIMERS_THROUGHPUT_SYNCHRONIZED_DEFAULT) +from deepspeed.runtime.config_utils import DeepSpeedConfigModel + +######################################### +# Timers +######################################### +# Timers. By default, timers are enabled. 
+# Users can configure in ds_config.json as below example: +TIMERS_FORMAT = ''' +Timers should be enabled as: +"timers": { + "throughput": { + "enabled": true, + "synchronized": true + } +} +''' + +TIMERS = "timers" +TIMERS_THROUGHPUT = "throughput" + + +def get_timers_config(param_dict): + if param_dict and TIMERS in param_dict and TIMERS_THROUGHPUT in param_dict[TIMERS]: + timers_config_dict = param_dict[TIMERS][TIMERS_THROUGHPUT] + else: + timers_config_dict = {} + return DeepSpeedThroughputTimerConfig(**timers_config_dict) + + +class DeepSpeedThroughputTimerConfig(DeepSpeedConfigModel): + """ Configure throughput timers """ + + enabled: bool = True + """ Turn on/off throughput timers """ + + synchronized: bool = True + """ Whether to synchronize a device when measuring the time. + Synchronizing a device is required to produce the most accurate timer measurements. + However, this comes at the expense of performance degradation. The CPU timer provides + sufficient accuracy in many cases. + """ \ No newline at end of file From a938f5097644ff2473a411d51b803247e8c33d2e Mon Sep 17 00:00:00 2001 From: Max Kovalenko Date: Thu, 18 Apr 2024 15:31:16 +0300 Subject: [PATCH 3/4] Removed constants.py, not needed anymore --- deepspeed/utils/constants.py | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 deepspeed/utils/constants.py diff --git a/deepspeed/utils/constants.py b/deepspeed/utils/constants.py deleted file mode 100644 index 2f238e5aadfb..000000000000 --- a/deepspeed/utils/constants.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -######################################### -# Timers -######################################### -''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible -with a large number of GPUs. 
For elastic jobs, DeepSpeed will provide a batch size that -can support a large number of GPUs based on the user specified parameters -''' -TIMERS_FORMAT = ''' -Timers should be enabled as: -"timers": { - "throughput": { - "enabled": true, - "synchronized": true - } -} -''' - -TIMERS = "timers" -TIMERS_THROUGHPUT = "throughput" - -TIMERS_THROUGHPUT_ENABLED = "enabled" -TIMERS_THROUGHPUT_ENABLED_DEFAULT = True - -# Synchronizing a device is required to produce the most accurate timer measurements. -# However, this comes at the expense of performance degradation. The CPU timer provides -# sufficient accuracy in many cases. -TIMERS_THROUGHPUT_SYNCHRONIZED = "synchronized" -TIMERS_THROUGHPUT_SYNCHRONIZED_DEFAULT = True From 599b5da5bbe78b1b22e6d19d2b27cdda9c65bb04 Mon Sep 17 00:00:00 2001 From: Max Kovalenko Date: Sun, 21 Apr 2024 16:25:46 +0300 Subject: [PATCH 4/4] Fixed pre-commit checks --- deepspeed/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/utils/config.py b/deepspeed/utils/config.py index 31c39bae5204..15f37ca7d874 100644 --- a/deepspeed/utils/config.py +++ b/deepspeed/utils/config.py @@ -43,4 +43,4 @@ class DeepSpeedThroughputTimerConfig(DeepSpeedConfigModel): Synchronizing a device is required to produce the most accurate timer measurements. However, this comes at the expense of performance degradation. The CPU timer provides sufficient accuracy in many cases. - """ \ No newline at end of file + """