You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When I try to reproduce the result, I encounter a bug. I have posted some information below. Please contact me if anyone knows how to deal with it. Any help from you will be greatly appreciated.
+++
-environment:
Ubuntu 22.04.3
python 3.10.12
CUDA 12.1
PyTorch 2.1.1
GPU V100 * 1
-Command:
python fed_seed_run.py /root/workspace fedavg rte fine-tuning 1000 0,0,0
-Log:
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 344, in main_loop
sender_rank, message_code, payload = self._network.recv(src=0)
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 102, in recv
sender_rank, message_code, content = PackageProcessor.recv_package(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 118, in recv_package
sender_rank, _, slices_size, message_code, data_type = recv_header(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 96, in recv_header
dist.recv(buffer, src=src)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1640, in recv
pg.recv([tensor], src, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:38114
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 368, in main_loop
self.synchronize()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 376, in synchronize
self._network.send(
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 90, in send
PackageProcessor.send_package(pack, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 74, in send_package
send_content(content=package.content, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 61, in send_content
dist.send(content, dst=dst)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1597, in send
default_pg.send([tensor], dst, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:41358
Exception ignored in: <function Pool.__del__ at 0x7f0097291990>
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/pool.py", line 271, in __del__
File "/usr/lib/python3.10/multiprocessing/queues.py", line 371, in put
AttributeError: 'NoneType' object has no attribute 'dumps'
+++
The text was updated successfully, but these errors were encountered:
When I try to reproduce the result, I encounter a bug. I have posted some information below. Please contact me if anyone knows how to deal with it. Any help from you will be greatly appreciated.
+++
-environment:
Ubuntu 22.04.3
python 3.10.12
CUDA 12.1
PyTorch 2.1.1
GPU V100 * 1
-Command:
python fed_seed_run.py /root/workspace fedavg rte fine-tuning 1000 0,0,0
-Log:
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 344, in main_loop
sender_rank, message_code, payload = self._network.recv(src=0)
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 102, in recv
sender_rank, message_code, content = PackageProcessor.recv_package(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 118, in recv_package
sender_rank, _, slices_size, message_code, data_type = recv_header(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 96, in recv_header
dist.recv(buffer, src=src)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1640, in recv
pg.recv([tensor], src, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:38114
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 368, in main_loop
self.synchronize()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 376, in synchronize
self._network.send(
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 90, in send
PackageProcessor.send_package(pack, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 74, in send_package
send_content(content=package.content, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 61, in send_content
dist.send(content, dst=dst)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1597, in send
default_pg.send([tensor], dst, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:41358
Exception ignored in: <function Pool.__del__ at 0x7f0097291990>
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/pool.py", line 271, in __del__
File "/usr/lib/python3.10/multiprocessing/queues.py", line 371, in put
AttributeError: 'NoneType' object has no attribute 'dumps'
+++
The text was updated successfully, but these errors were encountered: