A Note on a PyTorch Multi-GPU Training Problem
Sakura
Recently I have been training a model on a cluster server with torch.nn.DistributedDataParallel, launched through torchrun, but the job keeps dying partway through training with a RendezvousConnectionError.
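For reference, the setup looks roughly like the sketch below. This is only a minimal single-node DDP example, assuming two GPU workers launched with torchrun's c10d rendezvous backend (which matches the C10d store mentioned in the error); the script name train.py, the model, and the data are placeholders rather than the actual training code.

```python
# Launch (hypothetical): torchrun --nproc_per_node=2 --rdzv_backend=c10d \
#     --rdzv_endpoint=localhost:29400 train.py
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    # torchrun sets LOCAL_RANK / RANK / WORLD_SIZE for every worker process
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")

    # Placeholder model and optimizer; the real training code differs
    model = DDP(torch.nn.Linear(128, 10).cuda(local_rank),
                device_ids=[local_rank])
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for step in range(10_000):
        x = torch.randn(32, 128, device=f"cuda:{local_rank}")  # placeholder batch
        loss = model(x).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

Runs launched this way keep failing midway; the complete error message is as follows: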
WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'gpu09.cluster.com_24452_0' has failed to send a keep-alive heartbeat to the rendezvous '1ba4523a-8461-48d7-96c7-1c70a350b9f0' due to an error of type RendezvousConnectionError.
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24464 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24465 closing signal SIGTERM
WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'gpu09.cluster.com_24452_0' has failed to shutdown the rendezvous '1ba4523a-8461-48d7-96c7-1c70a350b9f0' due to an error of type RendezvousConnectionError.
ERROR:torch.distributed.elastic.multiprocessing.errors.error_handler:{
"message": {
"message": "RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.",
"extraInfo": {
"py_callstack": "Traceback (most recent call last):\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py\", line 113, in _call_store\n return getattr(self._store, store_op)(*args, **kwargs)\nRuntimeError: Broken pipe\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 345, in wrapper\n return f(*args, **kwargs)\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/run.py\", line 719, in main\n run(args)\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/run.py\", line 710, in run\n elastic_launch(\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/launcher/api.py\", line 131, in __call__\n return launch_agent(self._config, self._entrypoint, list(args))\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/launcher/api.py\", line 252, in launch_agent\n result = agent.run()\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py\", line 125, in wrapper\n result = f(*args, **kwargs)\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py\", line 709, in run\n result = self._invoke_run(role)\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py\", line 874, in _invoke_run\n num_nodes_waiting = rdzv_handler.num_nodes_waiting()\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py\", line 1079, in num_nodes_waiting\n self._state_holder.sync()\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py\", line 408, in sync\n get_response = self._backend.get_state()\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py\", line 73, in get_state\n base64_state: bytes = self._call_store(\"get\", self._key)\n File \"/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py\", line 115, in _call_store\n raise RendezvousConnectionError(\ntorch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.\n",
"timestamp": "1638142809"
Traceback (most recent call last):
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
return getattr(self._store, store_op)(*args, **kwargs)
RuntimeError: Broken pipe
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/users10/yuhanchen/miniconda3/envs/drug/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.10.0', 'console_scripts', 'torchrun')())
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/run.py", line 719, in main
run(args)
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 252, in launch_agent
result = agent.run()
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper
result = f(*args, **kwargs)
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 874, in _invoke_run
num_nodes_waiting = rdzv_handler.num_nodes_waiting()
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1079, in num_nodes_waiting
self._state_holder.sync()
File "/users10/yuhanchen/miniconda3/envs/drug/lib/python3.9/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 408, in sync