You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Dear zhejz,
When I run train_rl.sh, the training process fails with an EOFError. I have 40 GB of RAM and am running on a 3090 GPU. The error occurs frequently, but at different epochs: I have seen it after n_epoch: 0, n_epoch: 8, and n_epoch: 25.
Here is the full error:
Error executing job with overrides: ['agent.ppo.wb_run_path=null', 'wb_project=train_rl_experts', 'wb_name=roach', 'agent/ppo/policy=xtma_beta', 'agent.ppo.training.kwargs.explore_coef=0.05', 'carla_sh_path=/media/carla/AVRL/carla/CarlaUE4.sh']
Traceback (most recent call last):
File "train_rl.py", line 75, in main
agent.learn(env, total_timesteps=int(cfg.total_timesteps), callback=callback, seed=cfg.seed)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/rl_birdview_agent.py", line 109, in learn
model.learn(total_timesteps, callback=callback, seed=seed)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/models/ppo.py", line 249, in learn
callback.on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 95, in on_training_end
self._on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 179, in _on_training_end
callback.on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 95, in on_training_end
self._on_training_end()
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/utils/wandb_callback.py", line 67, in _on_training_end
avg_ep_stat, ep_events = self.evaluate_policy(self.vec_env, self.model.policy, eval_video_path)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/utils/wandb_callback.py", line 158, in evaluate_policy
obs, reward, done, info = env.step(actions)
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 161, in step
return self.step_wait()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 107, in step_wait
results = [remote.recv() for remote in self.remotes]
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 107, in <listcomp>
results = [remote.recv() for remote in self.remotes]
File "/usr/lib/python3.8/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 420, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 389, in _recv
raise EOFError
EOFError
The text was updated successfully, but these errors were encountered:
Dear zhejz,
When I run train_rl.sh, the training process fails with an EOFError. I have 40 GB of RAM and am running on a 3090 GPU. The error occurs frequently, but at different epochs: I have seen it after n_epoch: 0, n_epoch: 8, and n_epoch: 25.
Here is the full error:
Error executing job with overrides: ['agent.ppo.wb_run_path=null', 'wb_project=train_rl_experts', 'wb_name=roach', 'agent/ppo/policy=xtma_beta', 'agent.ppo.training.kwargs.explore_coef=0.05', 'carla_sh_path=/media/carla/AVRL/carla/CarlaUE4.sh']
Traceback (most recent call last):
File "train_rl.py", line 75, in main
agent.learn(env, total_timesteps=int(cfg.total_timesteps), callback=callback, seed=cfg.seed)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/rl_birdview_agent.py", line 109, in learn
model.learn(total_timesteps, callback=callback, seed=seed)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/models/ppo.py", line 249, in learn
callback.on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 95, in on_training_end
self._on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 179, in _on_training_end
callback.on_training_end()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/callbacks.py", line 95, in on_training_end
self._on_training_end()
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/utils/wandb_callback.py", line 67, in _on_training_end
avg_ep_stat, ep_events = self.evaluate_policy(self.vec_env, self.model.policy, eval_video_path)
File "/media/carla/AVRL/roach/DML_AVRL/agents/rl_birdview/utils/wandb_callback.py", line 158, in evaluate_policy
obs, reward, done, info = env.step(actions)
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 161, in step
return self.step_wait()
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 107, in step_wait
results = [remote.recv() for remote in self.remotes]
File "/media/carla/AVRL/roach/env/carla/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 107, in <listcomp>
results = [remote.recv() for remote in self.remotes]
File "/usr/lib/python3.8/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 420, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 389, in _recv
raise EOFError
EOFError
The text was updated successfully, but these errors were encountered: