diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index f4f7c436a2..c72e1b01db 100755 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -14,13 +14,15 @@ import tempfile import time import traceback +import warnings from argparse import ArgumentParser -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import psutil import torch import composer +from composer.loggers.mosaicml_logger import MOSAICML_LOG_DIR_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR from composer.utils import get_free_tcp_port CLEANUP_TIMEOUT = datetime.timedelta(seconds=30) @@ -260,7 +262,7 @@ def _launch_processes( command_mode: bool, training_script: str, stdout_file_format: str, - stderr_file_format: str, + stderr_file_format: Union[str, None], training_script_args: List[Any], processes: Dict[int, subprocess.Popen], ): @@ -315,17 +317,18 @@ def _get_file(format: str): ) return open(filename, 'x+') - stderr_file = _get_file(stderr_file_format) stdout_file = _get_file(stdout_file_format) + stderr_file = _get_file(stderr_file_format) if stderr_file_format is not None else None process = subprocess.Popen( cmd, stdout=stdout_file, - stderr=stderr_file, + stderr=stderr_file if stderr_file is not None else subprocess.STDOUT, text=True, ) - process.stderr = stderr_file process.stdout = stdout_file + if stderr_file is not None: + process.stderr = stderr_file processes[global_rank] = process @@ -357,6 +360,7 @@ def _monitor_processes(processes: Dict[int, subprocess.Popen]): def _print_process_exit_status(global_rank: int, process: subprocess.Popen): + stdOutLabel = 'STDOUT' if process.stdout is None: output = None else: @@ -365,6 +369,7 @@ def _print_process_exit_status(global_rank: int, process: subprocess.Popen): if process.stderr is None: stderr = None + stdOutLabel = 'logs' else: process.stderr.seek(0) stderr = process.stderr.read() @@ -374,13 +379,15 @@ def _print_process_exit_status(global_rank: int, process: subprocess.Popen): output=output, stderr=stderr, ) + error_msg = [f'Global rank {global_rank} (PID {process.pid}) exited with code {process.returncode}'] if output is not None: error_msg.extend([ - f'----------Begin global rank {global_rank} STDOUT----------', + f'----------Begin global rank {global_rank} {stdOutLabel}----------', output, - f'----------End global rank {global_rank} STDOUT----------', + f'----------End global rank {global_rank} {stdOutLabel}----------', ]) + if stderr is not None: error_msg.extend([ f'----------Begin global rank {global_rank} STDERR----------', @@ -473,6 +480,19 @@ def main(): if args.stderr is None: args.stderr = f'{log_tmpdir.name}/rank{{rank}}.stderr.txt' + # If running on the Mosaic platform, log all gpu ranks' stderr and stdout to Mosaic platform + if os.environ.get( + MOSAICML_PLATFORM_ENV_VAR, + 'false').lower() == 'true' and str(os.environ.get(MOSAICML_LOG_DIR_ENV_VAR, 'false')).lower() != 'false': + log.info('Logging all GPU ranks to Mosaic Platform.') + log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR_ENV_VAR)}/gpu_{{rank}}.txt' + if args.stderr is not None or args.stdout is not None: + warnings.warn( + 'Logging to Mosaic Platform. Ignoring provided stdout and stderr args. To use provided stdout and stderr, set MOSAICML_LOG_DIR=false.' + ) + args.stdout = log_file_format + args.stderr = None + try: _launch_processes(nproc=args.nproc, world_size=args.world_size, diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 06b70436a8..d4338a407a 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -36,6 +36,7 @@ RUN_NAME_ENV_VAR = 'RUN_NAME' MOSAICML_PLATFORM_ENV_VAR = 'MOSAICML_PLATFORM' MOSAICML_ACCESS_TOKEN_ENV_VAR = 'MOSAICML_ACCESS_TOKEN_FILE' +MOSAICML_LOG_DIR_ENV_VAR = 'MOSAICML_LOG_DIR' class MosaicMLLogger(LoggerDestination):