You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
(lss) root@lsp-ws:/home/user/GLM-main# bash scripts/generate_block.sh
config_tasks/model_blocklm_10B_chinese.sh
/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
[2023-07-21 11:28:46,033] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
File "generate_samples.py", line 23, in
from arguments import get_args
File "/home/user/GLM-main/arguments.py", line 23, in
from utils import get_hostname
File "/home/user/GLM-main/utils.py", line 26, in
from fp16 import FP16_Optimizer
File "/home/user/GLM-main/fp16/__init__.py", line 15, in <module>
from .fp16util import (
File "/home/user/GLM-main/fp16/fp16util.py", line 21, in
import mpu
File "/home/user/GLM-main/mpu/__init__.py", line 50, in <module>
from .transformer import GPT2ParallelTransformer
File "/home/user/GLM-main/mpu/transformer.py", line 22, in
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/__init__.py", line 7, in <module>
from . import amp
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/__init__.py", line 1, in <module>
from .amp import init, half_function, float_function, promote_function,
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/amp.py", line 1, in
from . import compat, rnn_compat, utils, wrap
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/rnn_compat.py", line 1, in
from . import utils, wrap
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/wrap.py", line 3, in
from ._amp_state import _amp_state
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/_amp_state.py", line 14, in
from torch._six import container_abcs
ImportError: cannot import name 'container_abcs' from 'torch._six' (/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/_six.py)
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 115601) of binary: /root/anaconda3/envs/lss/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/lss/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/anaconda3/envs/lss/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in
main()
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
(lss) root@lsp-ws:/home/user/GLM-main# bash scripts/generate_block.sh
config_tasks/model_blocklm_10B_chinese.sh
/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
[2023-07-21 11:28:46,033] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
File "generate_samples.py", line 23, in
from arguments import get_args
File "/home/user/GLM-main/arguments.py", line 23, in
from utils import get_hostname
File "/home/user/GLM-main/utils.py", line 26, in
from fp16 import FP16_Optimizer
File "/home/user/GLM-main/fp16/__init__.py", line 15, in <module>
from .fp16util import (
File "/home/user/GLM-main/fp16/fp16util.py", line 21, in
import mpu
File "/home/user/GLM-main/mpu/__init__.py", line 50, in <module>
from .transformer import GPT2ParallelTransformer
File "/home/user/GLM-main/mpu/transformer.py", line 22, in
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/__init__.py", line 7, in <module>
from . import amp
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/__init__.py", line 1, in <module>
from .amp import init, half_function, float_function, promote_function,
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/amp.py", line 1, in
from . import compat, rnn_compat, utils, wrap
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/rnn_compat.py", line 1, in
from . import utils, wrap
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/wrap.py", line 3, in
from ._amp_state import _amp_state
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/apex/amp/_amp_state.py", line 14, in
from torch._six import container_abcs
ImportError: cannot import name 'container_abcs' from 'torch._six' (/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/_six.py)
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 115601) of binary: /root/anaconda3/envs/lss/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/lss/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/anaconda3/envs/lss/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in
main()
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/lss/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
generate_samples.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2023-07-21_11:28:50
host : lsp-ws
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 115601)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
(lss) root@lsp-ws:/home/user/GLM-main#
这是什么原因呢?谢谢
The text was updated successfully, but these errors were encountered: