Skip to content

Commit

Permalink
Log all GPU ranks' stdout/stderr to the MosaicML platform (mosaicml#2839)
Browse files Browse the repository at this point in the history
* logging all gpu ranks

* check if mosaic log dir is not none

* refactor

* Added logging

* updated python file command

* redirecting to separate stderr and stdout files

* added constant

* try logging to two files

* teeing to two files

* chronological logs

* formatted merged in dev

* reassign process std to log file

* formatting all logs exceptions

* adding error formatting

* removed process call

* default args stdout/stderr if not on mosaic platform

* updated formatting to say STDOUT if not on mosaic

* fixed typing

* added warning, renamed env var

* updated warning

* added instructions for overriding the ignore behavior; defaulted log env var to empty str

* set default for log dir

* changed message

* set platform to false for provided stdout and err

* changed default for log dir to false

* Update composer/cli/launcher.py

Co-authored-by: Mihir Patel <[email protected]>

---------

Co-authored-by: Daniel King <[email protected]>
Co-authored-by: Mihir Patel <[email protected]>
  • Loading branch information
3 people authored Feb 5, 2024
1 parent 7b9c42e commit 12261d6
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
34 changes: 27 additions & 7 deletions composer/cli/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
import tempfile
import time
import traceback
import warnings
from argparse import ArgumentParser
from typing import Any, Dict, List
from typing import Any, Dict, List, Union

import psutil
import torch

import composer
from composer.loggers.mosaicml_logger import MOSAICML_LOG_DIR_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR
from composer.utils import get_free_tcp_port

CLEANUP_TIMEOUT = datetime.timedelta(seconds=30)
Expand Down Expand Up @@ -260,7 +262,7 @@ def _launch_processes(
command_mode: bool,
training_script: str,
stdout_file_format: str,
stderr_file_format: str,
stderr_file_format: Union[str, None],
training_script_args: List[Any],
processes: Dict[int, subprocess.Popen],
):
Expand Down Expand Up @@ -315,17 +317,18 @@ def _get_file(format: str):
)
return open(filename, 'x+')

stderr_file = _get_file(stderr_file_format)
stdout_file = _get_file(stdout_file_format)
stderr_file = _get_file(stderr_file_format) if stderr_file_format is not None else None

process = subprocess.Popen(
cmd,
stdout=stdout_file,
stderr=stderr_file,
stderr=stderr_file if stderr_file is not None else subprocess.STDOUT,
text=True,
)
process.stderr = stderr_file
process.stdout = stdout_file
if stderr_file is not None:
process.stderr = stderr_file
processes[global_rank] = process


Expand Down Expand Up @@ -357,6 +360,7 @@ def _monitor_processes(processes: Dict[int, subprocess.Popen]):


def _print_process_exit_status(global_rank: int, process: subprocess.Popen):
stdOutLabel = 'STDOUT'
if process.stdout is None:
output = None
else:
Expand All @@ -365,6 +369,7 @@ def _print_process_exit_status(global_rank: int, process: subprocess.Popen):

if process.stderr is None:
stderr = None
stdOutLabel = 'logs'
else:
process.stderr.seek(0)
stderr = process.stderr.read()
Expand All @@ -374,13 +379,15 @@ def _print_process_exit_status(global_rank: int, process: subprocess.Popen):
output=output,
stderr=stderr,
)

error_msg = [f'Global rank {global_rank} (PID {process.pid}) exited with code {process.returncode}']
if output is not None:
error_msg.extend([
f'----------Begin global rank {global_rank} STDOUT----------',
f'----------Begin global rank {global_rank} {stdOutLabel}----------',
output,
f'----------End global rank {global_rank} STDOUT----------',
f'----------End global rank {global_rank} {stdOutLabel}----------',
])

if stderr is not None:
error_msg.extend([
f'----------Begin global rank {global_rank} STDERR----------',
Expand Down Expand Up @@ -473,6 +480,19 @@ def main():
if args.stderr is None:
args.stderr = f'{log_tmpdir.name}/rank{{rank}}.stderr.txt'

# If running on the Mosaic platform, log all gpu ranks' stderr and stdout to Mosaic platform
if os.environ.get(
MOSAICML_PLATFORM_ENV_VAR,
'false').lower() == 'true' and str(os.environ.get(MOSAICML_LOG_DIR_ENV_VAR, 'false')).lower() != 'false':
log.info('Logging all GPU ranks to Mosaic Platform.')
log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR_ENV_VAR)}/gpu_{{rank}}.txt'
if args.stderr is not None or args.stdout is not None:
warnings.warn(
'Logging to Mosaic Platform. Ignoring provided stdout and stderr args. To use provided stdout and stderr, set MOSAICML_LOG_DIR=false.'
)
args.stdout = log_file_format
args.stderr = None

try:
_launch_processes(nproc=args.nproc,
world_size=args.world_size,
Expand Down
1 change: 1 addition & 0 deletions composer/loggers/mosaicml_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
RUN_NAME_ENV_VAR = 'RUN_NAME'
MOSAICML_PLATFORM_ENV_VAR = 'MOSAICML_PLATFORM'
MOSAICML_ACCESS_TOKEN_ENV_VAR = 'MOSAICML_ACCESS_TOKEN_FILE'
MOSAICML_LOG_DIR_ENV_VAR = 'MOSAICML_LOG_DIR'


class MosaicMLLogger(LoggerDestination):
Expand Down

0 comments on commit 12261d6

Please sign in to comment.