Skip to content

Commit

Permalink
Merge branch 'master' into wengshiy/accelerator_align
Browse files Browse the repository at this point in the history
  • Loading branch information
tjruwase authored May 15, 2024
2 parents 3562b68 + 23173fa commit e89b939
Show file tree
Hide file tree
Showing 12 changed files with 299 additions and 10 deletions.
92 changes: 92 additions & 0 deletions deepspeed/monitor/comet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from typing import TYPE_CHECKING, Any, Tuple, List, Dict, Optional

from .utils import check_comet_availability
from .monitor import Monitor

import deepspeed.comm as dist

if TYPE_CHECKING:
import comet_ml
from .config import CometConfig

# Type aliases describing one monitor event.
Name = str
Value = Any
GlobalSamples = int
# CometMonitor.write_events reads an event positionally:
# index 0 is the name, index 1 the value, index 2 the global sample count.
Event = Tuple[Name, Value, GlobalSamples]


class CometMonitor(Monitor):
    """Monitor that writes DeepSpeed events to a Comet experiment.

    The experiment is only started when the monitor is enabled and the
    current process is rank 0; otherwise ``experiment`` stays ``None``.
    """

    def __init__(self, comet_config: "CometConfig"):
        """Validate the Comet installation and, on rank 0, start the experiment.

        Args:
            comet_config: Comet section of the monitor configuration.
        """
        super().__init__(comet_config)
        check_comet_availability()
        # Imported lazily, after the availability check has passed.
        import comet_ml

        self.enabled = comet_config.enabled
        self._samples_log_interval = comet_config.samples_log_interval
        self._experiment: Optional["comet_ml.ExperimentBase"] = None

        should_start_experiment = self.enabled and dist.get_rank() == 0
        if should_start_experiment:
            started_experiment = comet_ml.start(
                api_key=comet_config.api_key,
                project=comet_config.project,
                workspace=comet_config.workspace,
                experiment_key=comet_config.experiment_key,
                mode=comet_config.mode,
                online=comet_config.online,
            )
            self._experiment = started_experiment

            # An explicit experiment name is optional; apply it only when given.
            if comet_config.experiment_name is not None:
                started_experiment.set_name(comet_config.experiment_name)

        # The scheduler is created regardless of rank or enabled state.
        self._events_log_scheduler = EventsLogScheduler(comet_config.samples_log_interval)

    @property
    def experiment(self) -> Optional["comet_ml.ExperimentBase"]:
        """The Comet experiment, or ``None`` if none was started."""
        return self._experiment

    @property
    def samples_log_interval(self) -> int:
        """The configured ``samples_log_interval`` value."""
        return self._samples_log_interval

    def write_events(self, event_list: List[Event]) -> None:
        """Log every event that the scheduler reports as due.

        Does nothing unless the monitor is enabled and this process is rank 0.

        Args:
            event_list: Events indexed as (name, value, global samples).
        """
        if not (self.enabled and dist.get_rank() == 0):
            return None

        scheduler = self._events_log_scheduler
        for record in event_list:
            metric_name, metric_value, global_samples = record[0], record[1], record[2]

            if not scheduler.needs_logging(metric_name, global_samples):
                continue

            self._experiment.__internal_api__log_metric__(
                name=metric_name,
                value=metric_value,
                step=global_samples,
            )


class EventsLogScheduler:
    """Decide, per event name, whether enough samples have passed to log again."""

    def __init__(self, samples_log_interval: int):
        """Store the interval and start with an empty per-name history.

        Args:
            samples_log_interval: Minimum sample distance between two logged
                occurrences of the same event name.
        """
        self._samples_log_interval = samples_log_interval
        self._last_logged_events_samples: Dict[str, int] = {}

    def needs_logging(self, name: str, current_sample: int) -> bool:
        """Return True when the event `name` should be logged at `current_sample`.

        A name seen for the first time is always logged. Afterwards it is
        logged once at least `samples_log_interval` samples have elapsed since
        the last logged occurrence. Whenever True is returned, `current_sample`
        is recorded as the new reference point for that name.
        """
        history = self._last_logged_events_samples
        try:
            previous_sample = history[name]
        except KeyError:
            # First occurrence of this name.
            is_due = True
        else:
            elapsed = current_sample - previous_sample
            is_due = bool(elapsed >= self._samples_log_interval)

        if is_due:
            history[name] = current_sample
        return is_due
69 changes: 67 additions & 2 deletions deepspeed/monitor/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@

# DeepSpeed Team

from typing import Optional

from deepspeed.pydantic_v1 import root_validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel


def get_monitor_config(param_dict):
    """Build the monitor configuration from the top-level DeepSpeed config dict.

    Args:
        param_dict: Mapping that may contain one sub-dictionary per monitor
            backend. A missing backend key is passed on as an empty dict.

    Returns:
        A `DeepSpeedMonitorConfig` constructed from the per-backend sections.
    """
    # Single source of truth for the backend keys. The earlier list without
    # "comet" was built and immediately overwritten, so it has been removed.
    monitor_keys = ("tensorboard", "wandb", "csv_monitor", "comet")
    monitor_dict = {key: param_dict.get(key, {}) for key in monitor_keys}
    return DeepSpeedMonitorConfig(**monitor_dict)


Expand Down Expand Up @@ -60,12 +62,75 @@ class CSVConfig(DeepSpeedConfigModel):
""" Name for the current job. This will become a new directory inside `output_path`. """


class CometConfig(DeepSpeedConfigModel):
    """
    Sets parameters for the Comet monitor. Comet uses an experiment object
    for logging data.
    https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment/
    """

    enabled: bool = False
    """ Whether logging to Comet is enabled. Requires the `comet_ml` package to be installed. """

    samples_log_interval: int = 100
    """ Metrics will be submitted to Comet after processing every `samples_log_interval` samples. """

    project: Optional[str] = None
    """
    Comet project name. Can be set through .comet.config file or environment variable COMET_PROJECT_NAME
    https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
    """

    workspace: Optional[str] = None
    """
    Comet workspace name. Can be set through .comet.config file or environment variable COMET_WORKSPACE
    https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
    """

    api_key: Optional[str] = None
    """
    Comet API key. Can be set through .comet.config file or environment variable COMET_API_KEY
    https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
    """

    experiment_name: Optional[str] = None
    """
    The name of the Comet experiment to be used for logging.
    Can be set through .comet.config file or environment variable COMET_EXPERIMENT_NAME
    https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
    """

    experiment_key: Optional[str] = None
    """
    The key of the Comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters.
    Can be set through .comet.config file or environment variable COMET_EXPERIMENT_KEY
    https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
    """

    online: Optional[bool] = None
    """
    If True, the data will be logged to the Comet server; otherwise it will be stored locally in an offline experiment.
    Defaults to True.
    """

    mode: Optional[str] = None
    """
    Controls how the Comet experiment is started. Three options are possible:
    - "get": Continue logging to an existing experiment identified by the `experiment_key` value.
    - "create": Always creates a new experiment, useful for HPO sweeps.
    - "get_or_create" (default): Starts a fresh experiment if required, or persists logging to an existing one.
    """


class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
"""Sets parameters for various monitoring methods."""

tensorboard: TensorBoardConfig = {}
""" TensorBoard monitor, requires `tensorboard` package is installed. """

comet: CometConfig = {}
""" Comet monitor, requires `comet_ml` package is installed """

wandb: WandbConfig = {}
""" WandB monitor, requires `wandb` package is installed. """

Expand All @@ -75,5 +140,5 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
@root_validator
def check_enabled(cls, values):
    """Derive the top-level `enabled` flag from the individual monitor configs.

    Args:
        values: Validated field values of the monitor config, keyed by field name.

    Returns:
        The same `values` mapping with `values["enabled"]` set to a truthy
        value when at least one of the tensorboard, wandb, csv_monitor or
        comet configs has `enabled` set.
    """
    # Every term must read `.enabled`. Testing the comet config object itself
    # would always be truthy and report monitoring as enabled even when all
    # backends are off.
    values["enabled"] = (values.get("tensorboard").enabled or values.get("wandb").enabled
                         or values.get("csv_monitor").enabled or values.get("comet").enabled)
    return values
6 changes: 6 additions & 0 deletions deepspeed/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def write_events(self, event_list):
from .wandb import WandbMonitor
from .tensorboard import TensorBoardMonitor
from .csv_monitor import csvMonitor
from .comet import CometMonitor


class MonitorMaster(Monitor):
Expand All @@ -33,6 +34,7 @@ def __init__(self, monitor_config):
self.tb_monitor = None
self.wandb_monitor = None
self.csv_monitor = None
self.comet_monitor = None
self.enabled = monitor_config.enabled

if dist.get_rank() == 0:
Expand All @@ -42,6 +44,8 @@ def __init__(self, monitor_config):
self.wandb_monitor = WandbMonitor(monitor_config.wandb)
if monitor_config.csv_monitor.enabled:
self.csv_monitor = csvMonitor(monitor_config.csv_monitor)
if monitor_config.comet.enabled:
self.comet_monitor = CometMonitor(monitor_config.comet)

def write_events(self, event_list):
if dist.get_rank() == 0:
Expand All @@ -51,3 +55,5 @@ def write_events(self, event_list):
self.wandb_monitor.write_events(event_list)
if self.csv_monitor is not None:
self.csv_monitor.write_events(event_list)
if self.comet_monitor is not None:
self.comet_monitor.write_events(event_list)
13 changes: 13 additions & 0 deletions deepspeed/monitor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

# DeepSpeed Team

from packaging import version as pkg_version


def check_tb_availability():
try:
Expand All @@ -22,3 +24,14 @@ def check_wandb_availability():
'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart'
)
raise


def check_comet_availability():
    """Ensure `comet_ml` is importable and is at least version 3.41.0.

    Raises:
        ImportError: If `comet_ml` cannot be imported or its version is older
            than 3.41.0. An installation hint is printed before re-raising.
    """
    try:
        import comet_ml

        installed_version = pkg_version.parse(comet_ml.__version__)
        minimum_version = pkg_version.Version("3.41.0")
        if installed_version < minimum_version:
            raise ImportError("`comet_ml` must have at least version 3.41.0")
    except ImportError:
        # Covers both a missing package and the too-old-version error raised above.
        install_hint = 'If you want to use comet logging, please `pip install "comet_ml>=3.41.0"`'
        print(install_hint)
        raise
2 changes: 1 addition & 1 deletion deepspeed/runtime/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1263,7 +1263,7 @@ def _configure_optimizer(self, client_optimizer, model_parameters):
else:
self.optimizer = basic_optimizer

log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0])
log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer.__class__.__name__), ranks=[0])

self.compression_scheduler = self._configure_compression_scheduler()
self.quantizer = self._configure_quantization()
Expand Down
2 changes: 1 addition & 1 deletion docs/_data/navigation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ lnav:
- title: 'Flops Profiler'
url: /docs/config-json/#flops-profiler
- title: 'Monitoring'
url: /docs/config-json/#monitoring-module-tensorboard-wandb-csv
url: /docs/config-json/#monitoring-module
- title: 'Communication Logging'
url: /docs/config-json/#communication-logging
- title: 'Model Compression'
Expand Down
37 changes: 34 additions & 3 deletions docs/_pages/config-json.md
Original file line number Diff line number Diff line change
Expand Up @@ -1139,15 +1139,16 @@ DeepSpeed Data Efficiency Library includes two techniques: curriculum learning a
| ---------------------------------------------------------------------------------------------------------------------------- | ------- |
| List of which step to change difficulty level. One of the `schedule_config` when the `fixed_discrete` schedule_type is used. | N/A |

### Monitoring Module (TensorBoard, WandB, CSV)
### Monitoring Module

**Note:** Deepspeed logs to TensorBoard through PyTorch. Logging to TensorBoard requires that the `tensorboard` package is installed (read more in the [PyTorch documentation](https://pytorch.org/docs/1.8.0/tensorboard.html)).
{: .notice--warning}
**Note:** Logging to WandB requires that the `wandb` package is installed (read more in the [WandB documentation](https://docs.wandb.ai/quickstart)).
{: .notice--warning}
**Note:** Logging to Comet requires that the `comet_ml` package is installed (read more in the [Comet documentation](https://www.comet.com/docs/v2/guides/quickstart/#1-install-and-configure-the-comet-ml-sdk)).
{: .notice--warning}


Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), or to simple CSV files. Below is an overview of what DeepSpeed will log automatically.
Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), to [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=docs) or to simple CSV files. Below is an overview of what DeepSpeed will log automatically.

| Field | Description |Conditions |
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- |
Expand Down Expand Up @@ -1201,6 +1202,36 @@ Example of <i>**wandb**</i> configuration:
}
```

<i>**comet**</i>: [dictionary]

| Fields | Value | Default |
|--- |--- |--- |
| enabled | Whether logging to [Comet](https://www.comet.com/site/) is enabled. | `false` |
| workspace | Comet workspace name. | `None` |
| project | Comet project name. | `None` |
| samples_log_interval | Metrics will be submitted to Comet after processing every `samples_log_interval` samples. | `100` |
| experiment_name | The name for comet experiment to be used for logging. | `None` |
| api_key | Comet API key. It's not recommended to save the Comet API Key in code. | `None` |
| experiment_key | The key for comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters. | `None` |
| online | If True, the data will be logged to Comet server, otherwise it will be stored locally in offline experiment. Default is `True`. | `None` |
| mode | Controls how the Comet experiment is started. "get": Continue logging to an existing experiment identified by the `experiment_key` value. "create": Always creates a new experiment, useful for HPO sweeps. "get_or_create" (default): Starts a fresh experiment if required, or persists logging to an existing one. | `None` |


Example of <i>**comet**</i> configuration:

```json
"comet": {
"enabled": true,
"workspace": "my_workspace",
"project": "my_project",
"samples_log_interval": 50,
"experiment_name": "llama-fine-tuning",
"experiment_key": "0c4a1c4a90664f2a8084e600b19a9d7e",
"online": false,
"mode": "get"
}
```

<i>**csv_monitor**</i>: [dictionary]

| Fields | Value |Default |
Expand Down
15 changes: 12 additions & 3 deletions docs/_tutorials/monitor.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its

## Overview

Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), and simple CSV files.
Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial) and simple CSV files.

Below is a live monitoring view for TensorBoard:

Expand All @@ -21,16 +21,20 @@ Below is a live monitoring view for WandB:

![WandB Example Output](/assets/images/wandb_monitor.PNG){: .align-center}

Below is a live monitoring view for Comet:

![CometML Example Output](/assets/images/comet_monitor.png){: .align-center}

## Usage

The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics.
The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics.

- [Automatic Monitoring](#automatic-monitoring)
- [Custom Monitoring](#custom-monitoring)

### Automatic Monitoring

When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module-tensorboard-wandb-csv) for details.
When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module) for details.

```json
{
Expand All @@ -45,6 +49,11 @@ When using DeepSpeed for model training, the Monitor can be configured in the De
"group": "my_group",
"project": "my_project"
},
"comet": {
"enabled": true,
"project": "my_project",
"experiment_name": "my_experiment"
},
"csv_monitor": {
"enabled": true,
"output_path": "output/ds_logs/",
Expand Down
Binary file added docs/assets/images/comet_monitor.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions docs/code-docs/source/monitor.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ WandB
.. _WandbConfig:
.. autopydantic_model:: deepspeed.monitor.config.WandbConfig

Comet
-----
.. _CometConfig:
.. autopydantic_model:: deepspeed.monitor.config.CometConfig

CSV Monitor
-----------
.. _CSVConfig:
Expand Down
1 change: 1 addition & 0 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
accelerate
clang-format==16.0.2
comet_ml>=3.41.0
deepspeed-kernels ; sys_platform == 'linux'
docutils<0.18
future
Expand Down
Loading

0 comments on commit e89b939

Please sign in to comment.