From 6ecc7dc587359433877dd602f00e29959ca0744e Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 11 Jul 2024 16:34:34 -0700 Subject: [PATCH 1/8] EnvironmentLoggerCallback --- llmfoundry/callbacks/__init__.py | 3 +- llmfoundry/callbacks/envlogger.py | 177 ++++++++++++++++++++++++++++++ setup.py | 1 + 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 llmfoundry/callbacks/envlogger.py diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py index 496e905e13..23d1dd449e 100644 --- a/llmfoundry/callbacks/__init__.py +++ b/llmfoundry/callbacks/__init__.py @@ -16,6 +16,7 @@ from llmfoundry.callbacks.async_eval_callback import AsyncEval from llmfoundry.callbacks.curriculum_learning_callback import CurriculumLearning +from llmfoundry.callbacks.envlogger import EnvironmentLoggerCallback from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet from llmfoundry.callbacks.eval_output_logging_callback import EvalOutputLogging from llmfoundry.callbacks.fdiff_callback import FDiffMetrics @@ -55,8 +56,8 @@ callbacks.register('eval_output_logging', func=EvalOutputLogging) callbacks.register('mbmoe_tok_per_expert', func=MegaBlocksMoE_TokPerExpert) callbacks.register('run_timeout', func=RunTimeoutCallback) - callbacks.register('loss_perp_v_len', func=LossPerpVsContextLengthLogger) +callbacks.register('env_logger', func=EnvironmentLoggerCallback) callbacks_with_config.register('async_eval', func=AsyncEval) callbacks_with_config.register('curriculum_learning', func=CurriculumLearning) diff --git a/llmfoundry/callbacks/envlogger.py b/llmfoundry/callbacks/envlogger.py new file mode 100644 index 0000000000..f3d7a47ff5 --- /dev/null +++ b/llmfoundry/callbacks/envlogger.py @@ -0,0 +1,177 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import platform +import socket +from typing import Any, Optional + +import git +import pkg_resources +import psutil +import torch +from composer.core import Callback, State +from composer.loggers import Logger + + +class EnvironmentLoggerCallback(Callback): + """A callback for logging environment information during model training. + + This callback collects various pieces of information about the training environment, + including git repository details, package versions, system information, GPU details, + distributed training setup, NVIDIA driver information, and Docker container details. + + Args: + workspace_dir (str): The directory containing the workspace. Defaults to '/workspace'. + log_git (bool): Whether to log git repository information. Defaults to True. + log_packages (bool): Whether to log package versions. Defaults to True. + log_nvidia (bool): Whether to log NVIDIA driver information. Defaults to True. + log_docker (bool): Whether to log Docker container information. Defaults to True. + log_system (bool): Whether to log system information. Defaults to False. + log_gpu (bool): Whether to log GPU information. Defaults to False. + log_distributed (bool): Whether to log distributed training information. Defaults to False. + packages_to_log (list[str]): A list of package names to log versions for. Defaults to None. + + The collected information is logged as hyperparameters at the start of model fitting. + """ + + PACKAGES_TO_LOG = [ + 'llm-foundry', + 'mosaicml', + 'megablocks', + 'grouped-gemm', + 'torch', + 'flash_attn', + 'transformers', + 'datasets', + 'peft', + ] + + def __init__( + self, + workspace_dir: str = '/workspace', + log_git: bool = True, + log_nvidia: bool = True, + log_docker: bool = True, + log_packages: bool = True, + log_system: bool = False, + log_gpu: bool = False, + log_distributed: bool = False, + packages_to_log: Optional[list[str]] = None, + ): + self.workspace_dir = workspace_dir + self.log_git = log_git + self.log_packages = log_packages + self.log_nvidia = log_nvidia + self.log_docker = log_docker + self.log_system = log_system + self.log_gpu = log_gpu + self.log_distributed = log_distributed + self.env_data: dict[str, Any] = {} + self.packages_to_log = packages_to_log or self.PACKAGES_TO_LOG + + def _get_git_info(self, repo_path: str) -> dict[str, str]: + repo = git.Repo(repo_path) + return { + 'commit_hash': repo.head.commit.hexsha, + 'branch': repo.active_branch.name, + } + + def _get_package_version(self, package_name: str) -> Optional[str]: + try: + return pkg_resources.get_distribution(package_name).version + except pkg_resources.DistributionNotFound: + return None + + def _get_system_info(self) -> dict[str, Any]: + return { + 'python_version': platform.python_version(), + 'os': f'{platform.system()} {platform.release()}', + 'hostname': socket.gethostname(), + 'cpu_info': { + 'model': platform.processor(), + 'cores': psutil.cpu_count(logical=False), + 'threads': psutil.cpu_count(logical=True), + }, + 'memory': { + 'total': psutil.virtual_memory().total, + 'available': psutil.virtual_memory().available, + }, + } + + def _get_gpu_info(self) -> dict[str, Any]: + if torch.cuda.is_available(): + return { + 'model': torch.cuda.get_device_name(0), + 'count': torch.cuda.device_count(), + 'memory': { + 'total': torch.cuda.get_device_properties(0).total_memory, + 'allocated': torch.cuda.memory_allocated(0), + }, + } + return {'available': False} + + def _get_nvidia_info(self) -> dict[str, Any]: + if torch.cuda.is_available(): + nccl_version = torch.cuda.nccl.version() # type: ignore + return { + 'cuda_version': + torch.version.cuda, # type: ignore[attr-defined] + 'cudnn_version': torch.backends.cudnn.version( + ), # type: ignore[attr-defined] + 'nccl_version': '.'.join( + map(str, nccl_version), + ), + } + + return {'available': False} + + def _get_distributed_info(self) -> dict[str, Any]: + return { + 'world_size': int(os.environ.get('WORLD_SIZE', 1)), + 'local_world_size': int(os.environ.get('LOCAL_WORLD_SIZE', 1)), + 'rank': int(os.environ.get('RANK', 0)), + 'local_rank': int(os.environ.get('LOCAL_RANK', 0)), + } + + def _get_docker_info(self) -> dict[str, Any]: + from mcli import sdk + + run = sdk.get_run(os.environ['RUN_NAME']) + image, tag = run.image.split(':') + return { + 'image': image, + 'tag': tag, + } + + def fit_start(self, state: State, logger: Logger) -> None: + # Collect environment data + if self.log_git: + self.env_data['git_info'] = { + folder: + self._get_git_info(os.path.join(self.workspace_dir, folder)) + for folder in os.listdir(self.workspace_dir) + if os.path.isdir(os.path.join(self.workspace_dir, folder)) + } + + if self.log_packages: + self.env_data['package_versions'] = { + pkg: self._get_package_version(pkg) + for pkg in self.packages_to_log + } + if self.log_nvidia: + self.env_data['nvidia'] = self._get_nvidia_info() + + if self.log_docker: + self.env_data['docker'] = self._get_docker_info() + if self.log_system: + self.env_data['system_info'] = self._get_system_info() + + if self.log_gpu: + self.env_data['gpu_info'] = self._get_gpu_info() + + if self.log_distributed: + self.env_data['distributed_info'] = self._get_distributed_info() + + # Log the collected data + logger.log_hyperparameters({'environment_data': self.env_data}) diff --git a/setup.py b/setup.py index 309d7d3372..00a14a00b2 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ 'tenacity>=8.2.3,<9', 'catalogue>=2,<3', 'typer<1', + 'GitPython==3.1.43', ] extra_deps = {} From bc6b2a666a0bbcd70486eb9a71f80ea539aed3b0 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:07:18 -0700 Subject: [PATCH 2/8] Update envlogger.py --- llmfoundry/callbacks/envlogger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/envlogger.py b/llmfoundry/callbacks/envlogger.py index f3d7a47ff5..5abe48a5e9 100644 --- a/llmfoundry/callbacks/envlogger.py +++ b/llmfoundry/callbacks/envlogger.py @@ -117,8 +117,8 @@ def _get_nvidia_info(self) -> dict[str, Any]: return { 'cuda_version': torch.version.cuda, # type: ignore[attr-defined] - 'cudnn_version': torch.backends.cudnn.version( - ), # type: ignore[attr-defined] + 'cudnn_version': str(torch.backends.cudnn.version( + )), # type: ignore[attr-defined] 'nccl_version': '.'.join( map(str, nccl_version), ), From 2850830748c57fdd4d35a73e6e66d48f875d73aa Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Mon, 29 Jul 2024 15:37:50 -0700 Subject: [PATCH 3/8] fix --- llmfoundry/callbacks/envlogger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/envlogger.py b/llmfoundry/callbacks/envlogger.py index 5abe48a5e9..d90c2b6e89 100644 --- a/llmfoundry/callbacks/envlogger.py +++ b/llmfoundry/callbacks/envlogger.py @@ -117,8 +117,8 @@ def _get_nvidia_info(self) -> dict[str, Any]: return { 'cuda_version': torch.version.cuda, # type: ignore[attr-defined] - 'cudnn_version': str(torch.backends.cudnn.version( - )), # type: ignore[attr-defined] + 'cudnn_version': str(torch.backends.cudnn.version() + ), # type: ignore[attr-defined] 'nccl_version': '.'.join( map(str, nccl_version), ), From 72d100be4c642080879bbca8aac9c9a400e89cdc Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 15 Aug 2024 09:03:12 -0700 Subject: [PATCH 4/8] fixes --- llmfoundry/callbacks/__init__.py | 4 +- .../{envlogger.py => env_logging_callback.py} | 59 +++++++++---------- 2 files changed, 29 insertions(+), 34 deletions(-) rename llmfoundry/callbacks/{envlogger.py => env_logging_callback.py} (84%) diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py index 23d1dd449e..8c86dda2a6 100644 --- a/llmfoundry/callbacks/__init__.py +++ b/llmfoundry/callbacks/__init__.py @@ -16,7 +16,7 @@ from llmfoundry.callbacks.async_eval_callback import AsyncEval from llmfoundry.callbacks.curriculum_learning_callback import CurriculumLearning -from llmfoundry.callbacks.envlogger import EnvironmentLoggerCallback +from llmfoundry.callbacks.env_logging_callback import EnvironmentLoggingCallback from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet from llmfoundry.callbacks.eval_output_logging_callback import EvalOutputLogging from llmfoundry.callbacks.fdiff_callback import FDiffMetrics @@ -57,7 +57,7 @@ callbacks.register('mbmoe_tok_per_expert', func=MegaBlocksMoE_TokPerExpert) callbacks.register('run_timeout', func=RunTimeoutCallback) callbacks.register('loss_perp_v_len', func=LossPerpVsContextLengthLogger) -callbacks.register('env_logger', func=EnvironmentLoggerCallback) +callbacks.register('env_logger', func=EnvironmentLoggingCallback) callbacks_with_config.register('async_eval', func=AsyncEval) callbacks_with_config.register('curriculum_learning', func=CurriculumLearning) diff --git a/llmfoundry/callbacks/envlogger.py b/llmfoundry/callbacks/env_logging_callback.py similarity index 84% rename from llmfoundry/callbacks/envlogger.py rename to llmfoundry/callbacks/env_logging_callback.py index d90c2b6e89..5c91e9be5a 100644 --- a/llmfoundry/callbacks/envlogger.py +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -12,9 +12,24 @@ import torch from composer.core import Callback, State from composer.loggers import Logger - - -class EnvironmentLoggerCallback(Callback): +from composer.utils import dist +from mcli import sdk + +__all__ = ['EnvironmentLoggingCallback'] + +_PACKAGES_TO_LOG = [ + 'llm-foundry', + 'mosaicml', + 'megablocks', + 'grouped-gemm', + 'torch', + 'flash_attn', + 'transformers', + 'datasets', + 'peft', +] + +class EnvironmentLoggingCallback(Callback): """A callback for logging environment information during model training. This callback collects various pieces of information about the training environment, @@ -35,18 +50,6 @@ class EnvironmentLoggerCallback(Callback): The collected information is logged as hyperparameters at the start of model fitting. """ - PACKAGES_TO_LOG = [ - 'llm-foundry', - 'mosaicml', - 'megablocks', - 'grouped-gemm', - 'torch', - 'flash_attn', - 'transformers', - 'datasets', - 'peft', - ] - def __init__( self, workspace_dir: str = '/workspace', @@ -68,7 +71,7 @@ def __init__( self.log_gpu = log_gpu self.log_distributed = log_distributed self.env_data: dict[str, Any] = {} - self.packages_to_log = packages_to_log or self.PACKAGES_TO_LOG + self.packages_to_log = packages_to_log or _PACKAGES_TO_LOG def _get_git_info(self, repo_path: str) -> dict[str, str]: repo = git.Repo(repo_path) @@ -115,28 +118,21 @@ def _get_nvidia_info(self) -> dict[str, Any]: if torch.cuda.is_available(): nccl_version = torch.cuda.nccl.version() # type: ignore return { - 'cuda_version': - torch.version.cuda, # type: ignore[attr-defined] - 'cudnn_version': str(torch.backends.cudnn.version() - ), # type: ignore[attr-defined] - 'nccl_version': '.'.join( - map(str, nccl_version), - ), + 'cuda_version': torch.version.cuda, # type: ignore[attr-defined] + 'cudnn_version': str(torch.backends.cudnn.version()), # type: ignore[attr-defined] + 'nccl_version': '.'.join(map(str, nccl_version)), } - return {'available': False} def _get_distributed_info(self) -> dict[str, Any]: return { - 'world_size': int(os.environ.get('WORLD_SIZE', 1)), - 'local_world_size': int(os.environ.get('LOCAL_WORLD_SIZE', 1)), - 'rank': int(os.environ.get('RANK', 0)), - 'local_rank': int(os.environ.get('LOCAL_RANK', 0)), + 'world_size': dist.get_world_size(), + 'local_world_size': dist.get_local_world_size(), + 'rank': dist.get_global_rank(), + 'local_rank': dist.get_local_rank(), } def _get_docker_info(self) -> dict[str, Any]: - from mcli import sdk - run = sdk.get_run(os.environ['RUN_NAME']) image, tag = run.image.split(':') return { @@ -148,8 +144,7 @@ def fit_start(self, state: State, logger: Logger) -> None: # Collect environment data if self.log_git: self.env_data['git_info'] = { - folder: - self._get_git_info(os.path.join(self.workspace_dir, folder)) + folder: self._get_git_info(os.path.join(self.workspace_dir, folder)) for folder in os.listdir(self.workspace_dir) if os.path.isdir(os.path.join(self.workspace_dir, folder)) } From 11eab11d2605f776b5b407fdb40be9a6dd042224 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 15 Aug 2024 09:45:05 -0700 Subject: [PATCH 5/8] isort --- llmfoundry/callbacks/env_logging_callback.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/env_logging_callback.py b/llmfoundry/callbacks/env_logging_callback.py index 5c91e9be5a..1c55fb15e8 100644 --- a/llmfoundry/callbacks/env_logging_callback.py +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -13,6 +13,7 @@ from composer.core import Callback, State from composer.loggers import Logger from composer.utils import dist + from mcli import sdk __all__ = ['EnvironmentLoggingCallback'] @@ -29,6 +30,7 @@ 'peft', ] + class EnvironmentLoggingCallback(Callback): """A callback for logging environment information during model training. @@ -118,8 +120,10 @@ def _get_nvidia_info(self) -> dict[str, Any]: if torch.cuda.is_available(): nccl_version = torch.cuda.nccl.version() # type: ignore return { - 'cuda_version': torch.version.cuda, # type: ignore[attr-defined] - 'cudnn_version': str(torch.backends.cudnn.version()), # type: ignore[attr-defined] + 'cuda_version': + torch.version.cuda, # type: ignore[attr-defined] + 'cudnn_version': str(torch.backends.cudnn.version() + ), # type: ignore[attr-defined] 'nccl_version': '.'.join(map(str, nccl_version)), } return {'available': False} @@ -144,7 +148,8 @@ def fit_start(self, state: State, logger: Logger) -> None: # Collect environment data if self.log_git: self.env_data['git_info'] = { - folder: self._get_git_info(os.path.join(self.workspace_dir, folder)) + folder: + self._get_git_info(os.path.join(self.workspace_dir, folder)) for folder in os.listdir(self.workspace_dir) if os.path.isdir(os.path.join(self.workspace_dir, folder)) } From c9d18845898c404da00b321dd400e40223d0407d Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:04:34 -0700 Subject: [PATCH 6/8] fix --- llmfoundry/callbacks/env_logging_callback.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/env_logging_callback.py b/llmfoundry/callbacks/env_logging_callback.py index 1c55fb15e8..1b5514823c 100644 --- a/llmfoundry/callbacks/env_logging_callback.py +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -122,8 +122,9 @@ def _get_nvidia_info(self) -> dict[str, Any]: return { 'cuda_version': torch.version.cuda, # type: ignore[attr-defined] - 'cudnn_version': str(torch.backends.cudnn.version() - ), # type: ignore[attr-defined] + 'cudnn_version': str( + torch.backends.cudnn.version(), + ), # type: ignore[attr-defined] 'nccl_version': '.'.join(map(str, nccl_version)), } return {'available': False} From bc3cd9d5b1ae520acaf1e6c00df03c0d70160cc7 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:37:29 -0700 Subject: [PATCH 7/8] fix --- llmfoundry/callbacks/env_logging_callback.py | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/llmfoundry/callbacks/env_logging_callback.py b/llmfoundry/callbacks/env_logging_callback.py index 1b5514823c..7210f53ba5 100644 --- a/llmfoundry/callbacks/env_logging_callback.py +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -75,12 +75,17 @@ def __init__( self.env_data: dict[str, Any] = {} self.packages_to_log = packages_to_log or _PACKAGES_TO_LOG - def _get_git_info(self, repo_path: str) -> dict[str, str]: - repo = git.Repo(repo_path) - return { - 'commit_hash': repo.head.commit.hexsha, - 'branch': repo.active_branch.name, - } + def _get_git_info(self, repo_path: str) -> Optional[dict[str, str]]: + if not os.path.isdir(os.path.join(self.workspace_dir, folder)): + return None + try: + repo = git.Repo(repo_path) + return { + 'commit_hash': repo.head.commit.hexsha, + 'branch': repo.active_branch.name, + } + except (git.InvalidGitRepositoryError, git.NoSuchPathError): + return None def _get_package_version(self, package_name: str) -> Optional[str]: try: @@ -137,7 +142,9 @@ def _get_distributed_info(self) -> dict[str, Any]: 'local_rank': dist.get_local_rank(), } - def _get_docker_info(self) -> dict[str, Any]: + def _get_docker_info(self) -> Optional[dict[str, Any]]: + if 'RUN_NAME' not in os.environ: + return None run = sdk.get_run(os.environ['RUN_NAME']) image, tag = run.image.split(':') return { @@ -148,12 +155,13 @@ def _get_docker_info(self) -> dict[str, Any]: def fit_start(self, state: State, logger: Logger) -> None: # Collect environment data if self.log_git: - self.env_data['git_info'] = { - folder: - self._get_git_info(os.path.join(self.workspace_dir, folder)) - for folder in os.listdir(self.workspace_dir) - if os.path.isdir(os.path.join(self.workspace_dir, folder)) - } + self.env_data['git_info'] = {} + for folder in os.listdir(self.workspace_dir): + path = self._get_git_info( + os.path.join(self.workspace_dir, folder), + ) + if path: + self.env_data['git_info'][folder] = path if self.log_packages: self.env_data['package_versions'] = { @@ -164,7 +172,9 @@ def fit_start(self, state: State, logger: Logger) -> None: self.env_data['nvidia'] = self._get_nvidia_info() if self.log_docker: - self.env_data['docker'] = self._get_docker_info() + if docker_info := self._get_docker_info(): + self.env_data['docker'] = docker_info + if self.log_system: self.env_data['system_info'] = self._get_system_info() From 390d3a4b1fa5c0bf98451d3efcc4a62af5b37ff0 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Fri, 16 Aug 2024 23:22:04 -0700 Subject: [PATCH 8/8] fix --- llmfoundry/callbacks/env_logging_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/env_logging_callback.py b/llmfoundry/callbacks/env_logging_callback.py index 7210f53ba5..a192390976 100644 --- a/llmfoundry/callbacks/env_logging_callback.py +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -76,7 +76,7 @@ def __init__( self.packages_to_log = packages_to_log or _PACKAGES_TO_LOG def _get_git_info(self, repo_path: str) -> Optional[dict[str, str]]: - if not os.path.isdir(os.path.join(self.workspace_dir, folder)): + if not os.path.isdir(repo_path): return None try: repo = git.Repo(repo_path)