Skip to content

Commit

Permalink
Update README.md (#693)
Browse files (browse the repository at this point in the history)
```
________________________________ Traceback (most recent call last) _________________________________
_ /mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:604 in <module>                   _
_                                                                                                  _
_   601 _   cfg = om.merge(yaml_cfg, cli_cfg)                                                      _
_   602 _   om.resolve(cfg)                                                                        _
_   603 _   assert isinstance(cfg, DictConfig)                                                     _
_ _ 604 _   main(cfg)                                                                              _
_   605                                                                                            _
_                                                                                                  _
_ /mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:222 in main                       _
_                                                                                                  _
_   219 _   _   _   _   _   _   _   _   _   _   _   _    'dist_timeout',                           _
_   220 _   _   _   _   _   _   _   _   _   _   _   _    must_exist=False,                         _
_   221 _   _   _   _   _   _   _   _   _   _   _   _    default_value=600.0)                      _
_ _ 222 _   dist.initialize_dist(get_device(None), timeout=dist_timeout)                           _
_   223 _                                                                                          _
_   224 _   # Get global and device batch size information from distributed/single node setting    _
_   225 _   cfg = update_batch_size_info(cfg)                                                      _
_                                                                                                  _
_ /mnt/workdisk/brian/composer/composer/utils/dist.py:527 in initialize_dist                       _
_                                                                                                  _
_   524 _   _   os.environ.update(dist_env_var_defaults)                                           _
_   525 _   _   dist.init_process_group(device_obj.dist_backend, store=dist.HashStore(), world_s   _
_   526 _   else:                                                                                  _
_ _ 527 _   _   dist.init_process_group(device_obj.dist_backend, timeout=timeout_timedelta)        _
_   528                                                                                            _
_   529                                                                                            _
_   530 def get_sampler(dataset: torch.utils.data.Dataset, *, drop_last: bool = False, shuffle:    _
_                                                                                                  _
_ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/c10d_logger.py _
_ :74 in wrapper                                                                                   _
_                                                                                                  _
_   71 _   @functools.wraps(func)                                                                  _
_   72 _   def wrapper(*args, **kwargs):                                                           _
_   73 _   _   t1 = time.time_ns()                                                                 _
_ _ 74 _   _   func_return = func(*args, **kwargs)                                                 _
_   75 _   _   t2 = time.time_ns()                                                                 _
_   76 _   _                                                                                       _
_   77 _   _   if dist.is_initialized():                                                           _
_                                                                                                  _
_ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/distributed_c1 _
_ 0d.py:1141 in init_process_group                                                                 _
_                                                                                                  _
_   1138 _   _   _   rendezvous_iterator = rendezvous(                                             _
_   1139 _   _   _   _   init_method, rank, world_size, timeout=timeout                            _
_   1140 _   _   _   )                                                                             _
_ _ 1141 _   _   _   store, rank, world_size = next(rendezvous_iterator)                           _
_   1142 _   _   _   store.set_timeout(timeout)                                                    _
_   1143 _   _   _                                                                                 _
_   1144 _   _   _   # Use a PrefixStore to avoid accidental overrides of keys used by             _
_                                                                                                  _
_ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py: _
_ 231 in _env_rendezvous_handler                                                                   _
_                                                                                                  _
_   228 _   if "rank" in query_dict:                                                               _
_   229 _   _   rank = int(query_dict["rank"])                                                     _
_   230 _   else:                                                                                  _
_ _ 231 _   _   rank = int(_get_env_or_raise("RANK"))                                              _
_   232 _                                                                                          _
_   233 _   if "world_size" in query_dict:                                                         _
_   234 _   _   world_size = int(query_dict["world_size"])                                         _
_                                                                                                  _
_ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py: _
_ 216 in _get_env_or_raise                                                                         _
_                                                                                                  _
_   213 _   def _get_env_or_raise(env_var: str) -> str:                                            _
_   214 _   _   env_val = os.environ.get(env_var, None)                                            _
_   215 _   _   if not env_val:                                                                    _
_ _ 216 _   _   _   raise _env_error(env_var)                                                      _
_   217 _   _   else:                                                                              _
_   218 _   _   _   return env_val                                                                 _
_   219                                 
```
  • Loading branch information
j316chuck authored Oct 26, 2023
1 parent 08611b0 commit c60657b
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ python inference/convert_composer_to_hf.py \
# --hf_repo_for_upload user-org/repo-name

# Evaluate the model on a subset of tasks
- python eval/eval.py \
+ composer eval/eval.py \
eval/yamls/hf_eval.yaml \
icl_tasks=eval/yamls/copa.yaml \
model_name_or_path=mpt-125m-hf
Expand Down

0 comments on commit c60657b

Please sign in to comment.