Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
```
──────────────────────────── Traceback (most recent call last) ────────────────────────────
/mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:604 in <module>

    601   cfg = om.merge(yaml_cfg, cli_cfg)
    602   om.resolve(cfg)
    603   assert isinstance(cfg, DictConfig)
  > 604   main(cfg)
    605

/mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:222 in main

    219                     'dist_timeout',
    220                     must_exist=False,
    221                     default_value=600.0)
  > 222   dist.initialize_dist(get_device(None), timeout=dist_timeout)
    223
    224   # Get global and device batch size information from distributed/single node setting
    225   cfg = update_batch_size_info(cfg)

/mnt/workdisk/brian/composer/composer/utils/dist.py:527 in initialize_dist

    524       os.environ.update(dist_env_var_defaults)
    525       dist.init_process_group(device_obj.dist_backend, store=dist.HashStore(), world_s…
    526   else:
  > 527       dist.init_process_group(device_obj.dist_backend, timeout=timeout_timedelta)
    528
    529
    530 def get_sampler(dataset: torch.utils.data.Dataset, *, drop_last: bool = False, shuffle:…

/mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/c10d_logger.py:74 in wrapper

    71   @functools.wraps(func)
    72   def wrapper(*args, **kwargs):
    73       t1 = time.time_ns()
  > 74       func_return = func(*args, **kwargs)
    75       t2 = time.time_ns()
    76
    77       if dist.is_initialized():

/mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:1141 in init_process_group

    1138           rendezvous_iterator = rendezvous(
    1139               init_method, rank, world_size, timeout=timeout
    1140           )
  > 1141           store, rank, world_size = next(rendezvous_iterator)
    1142           store.set_timeout(timeout)
    1143
    1144           # Use a PrefixStore to avoid accidental overrides of keys used by

/mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py:231 in _env_rendezvous_handler

    228   if "rank" in query_dict:
    229       rank = int(query_dict["rank"])
    230   else:
  > 231       rank = int(_get_env_or_raise("RANK"))
    232
    233   if "world_size" in query_dict:
    234       world_size = int(query_dict["world_size"])

/mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py:216 in _get_env_or_raise

    213   def _get_env_or_raise(env_var: str) -> str:
    214       env_val = os.environ.get(env_var, None)
    215       if not env_val:
  > 216           raise _env_error(env_var)
    217       else:
    218           return env_val
    219
```
- Loading branch information