diff --git a/examples/find_and_avoid/README.md b/examples/find_and_avoid/README.md
index 64bf8232..db5cf1ff 100644
--- a/examples/find_and_avoid/README.md
+++ b/examples/find_and_avoid/README.md
@@ -1,6 +1,6 @@
 # Find target & avoid obstacles
 
-NOTE: This agent do not converge
+:warning: NOTE: This agent does not converge well
 
 This is a typical find target and avoid obstacles task with a simple world
 configuration. For this task the E-puck robot is used, which is a compact mobile
@@ -21,4 +21,8 @@ gas and wheel).
 [implementation](https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code/tree/master/DDPG)
 which is presented in [Youtube video](https://www.youtube.com/watch?v=6Yd5WnYls_Y).
+
+|Trained Agent Showcase|Reward Per Episode Plot|
+|----------------------|-----------------------|
+|![image](https://github.com/KelvinYang0320/deepworlds/blob/dev/examples/find_and_avoid/doc/demo.gif)|![image](https://github.com/KelvinYang0320/deepworlds/blob/dev/examples/find_and_avoid/doc/trend.png)|
diff --git a/examples/find_and_avoid/controllers/robot/robot.py b/examples/find_and_avoid/controllers/robot/robot.py
index ecc7c55d..3a9675dc 100644
--- a/examples/find_and_avoid/controllers/robot/robot.py
+++ b/examples/find_and_avoid/controllers/robot/robot.py
@@ -30,12 +30,11 @@ def use_message_data(self, message):
         # Action 0 is turning
         wheel = float(message[0])
 
-        # Normalzie gas from [-1, 1] to [0.3, 1.3] to make robot always move forward
-        gas *= 4
-        # Clip it
-        # gas = np.clip(gas, -0.5, 4.0)
+        # Mapping gas from [-1, 1] to [0, 4] to make the robot always move forward
+        gas = (gas+1)*2
+        gas = np.clip(gas, 0, 4.0)
 
-        # Clip turning rate to [-1, 1]
+        # Mapping turning rate from [-1, 1] to [-2, 2]
         wheel *= 2
         wheel = np.clip(wheel, -2, 2)
 
@@ -44,11 +43,10 @@ def use_message_data(self, message):
         self.motorSpeeds[1] = gas - wheel
 
-        # Clip final motor speeds to [-4, 4] to be sure that motors get valid values
-        self.motorSpeeds = np.clip(self.motorSpeeds, -4, 4)
+        # Clip final motor speeds to [0, 6] to be sure that motors get valid values
+        self.motorSpeeds = np.clip(self.motorSpeeds, 0, 6)
 
         # Apply motor speeds
-        self.leftMotor.setVelocity(self.motorSpeeds[0])
-        self.rightMotor.setVelocity(self.motorSpeeds[1])
+        self._setVelocity(self.motorSpeeds[0], self.motorSpeeds[1])
 
     def setup_rangefinders(self, n_rangefinders):
         # Sensor setup
@@ -65,12 +63,14 @@ def setup_motors(self):
         # Motor setup
         self.leftMotor = self.robot.getDevice('left wheel motor')
         self.rightMotor = self.robot.getDevice('right wheel motor')
+        self._setVelocity(0.0, 0.0)
+        self.motorSpeeds = [0.0, 0.0]
+
+    def _setVelocity(self, v1, v2):
         self.leftMotor.setPosition(float('inf'))
         self.rightMotor.setPosition(float('inf'))
-        self.leftMotor.setVelocity(0.0)
-        self.rightMotor.setVelocity(0.0)
-
-        self.motorSpeeds = [0.0, 0.0]
+        self.leftMotor.setVelocity(v1)
+        self.rightMotor.setVelocity(v2)
 
 
 robot_controller = FindTargetRobot(8)
diff --git a/examples/find_and_avoid/controllers/supervisor/checkConvergence.py b/examples/find_and_avoid/controllers/supervisor/checkConvergence.py
new file mode 100644
index 00000000..27f6b575
--- /dev/null
+++ b/examples/find_and_avoid/controllers/supervisor/checkConvergence.py
@@ -0,0 +1,28 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import time
+def moving_average(x, w):
+    return np.convolve(x, np.ones(w), 'valid') / w
+
+def Plot(text_path="./exports/", dest_path="./exports/"):
+    fp=open(text_path+"Episode-score.txt", "r")
+    lines = fp.read().splitlines()
+    scores = list(map(float, lines))
+    episode = list(range(1, 1+len(scores)))
+    plt.figure()
+    plt.title("Episode score per episode")
+    plt.plot(episode, scores, label='Raw data')
+    SMA = moving_average(scores, 50)
+    plt.plot(SMA, label='SMA50')
+    plt.xlabel("episode")
+    plt.ylabel("episode score")
+
+    plt.legend()
+    plt.savefig(dest_path+'trend.png')
+    plt.close()
+
+if __name__ == '__main__':
+    while(1):
+
+        Plot()
+        time.sleep(60)
\ No newline at end of file
diff --git a/examples/find_and_avoid/controllers/supervisor/supervisor.py b/examples/find_and_avoid/controllers/supervisor/supervisor.py
index e149893a..fdb7a184 100644
--- a/examples/find_and_avoid/controllers/supervisor/supervisor.py
+++ b/examples/find_and_avoid/controllers/supervisor/supervisor.py
@@ -6,7 +6,7 @@
 import utilities as utils
 from models.networks import DDPG
-
+import os
 
 OBSERVATION_SPACE = 10
 ACTION_SPACE = 2
@@ -26,12 +26,21 @@ def __init__(self, robot, target):
         self.target_name = target
         self.robot = self.getFromDef(robot)
         self.target = self.getFromDef(target)
-        self.findThreshold = 0.12
+        self.findThreshold = 0.05
         self.steps = 0
-        self.steps_threshold = 600
+        self.steps_threshold = 500
         self.message = []
         self.should_done = False
+        self.pre_distance = None
+        '''
+        Get the other 2 intermediate targets when training the robot in small_world.wbt instead of small_world_easy.wbt.
+
+        self.mid1 = self.getFromDef("mid1")
+        self.mid2 = self.getFromDef("mid2")
+        '''
+        self.is_solved = False
+
     def get_default_observation(self):
         return [0 for i in range(OBSERVATION_SPACE)]
@@ -80,32 +89,42 @@ def get_reward(self, action):
             return 0
 
         rf_values = np.array(self.message[:8])
-
+
         reward = 0
-        if self.steps > self.steps_threshold:
-            return -10
-
-        if utils.get_distance_from_target(self.robot,
-                                          self.target) < self.findThreshold:
-            return +10
-
-        if np.abs(action[1]) > 1.5 or np.abs(action[0]) > 1.5:
-            if self.steps > 10:
-                self.should_done = True
-            return -1
-
-        if np.max(rf_values) > 500:
-            if self.steps > 10:
-                self.should_done = True
-            return -1
-        elif np.max(rf_values) > 200:
-            return -0.5
-
-        # if (distance != 0):
-        #     reward = 0.1 * np.round((0.6 / distance), 1)
-
+        # # (1) Take too many steps
+        # if self.steps > self.steps_threshold:
+        #     return -10 # reward -= (self.steps / self.steps_threshold)
+
+        # # (2) Reward according to distance
+        target_ = self.target
+
+        if self.pre_distance == None:
+            self.pre_distance = utils.get_distance_from_target(self.robot, target_)
+        else:
+            cur_distance = utils.get_distance_from_target(self.robot, target_)
+            reward = self.pre_distance - cur_distance
+            self.pre_distance = cur_distance
+
+        # # (3) Find the target
+        # if utils.get_distance_from_target(self.robot, self.target) < self.findThreshold:
+        #     reward += 5
+
+        # # (4) Action 1 (gas) or Action 0 (turning) should be <= 1.5
+        # if np.abs(action[1]) > 1.5 or np.abs(action[0]) > 1.5:
+        #     if self.steps > 10:
+        #         self.should_done = True
+        #     return -1
+
+        # # (5) Stop or punish the agent when the robot is getting too close to an obstacle
+        # if np.max(rf_values) > 500:
+        #     if self.steps > 10:
+        #         self.should_done = True
+        #     return -1
+        # elif np.max(rf_values) > 200:
+        #     return -0.5
+
         return reward
 
     def is_done(self):
@@ -114,10 +133,10 @@ def is_done(self):
         if distance < self.findThreshold:
             print("======== + Solved + ========")
+            self.is_solved = True
             return True
 
         if self.steps > self.steps_threshold or self.should_done:
-
             return True
 
         return False
@@ -125,6 +144,8 @@ def is_done(self):
     def reset(self):
         self.steps = 0
         self.should_done = False
+        self.pre_distance = None
+        self.is_solved = False
 
         return super().reset()
@@ -132,56 +153,86 @@ def get_info(self):
         pass
 
 
-supervisor_pre = 
FindTargetSupervisor('robot', 'target') -supervisor_env = KeyboardPrinter(supervisor_pre) -agent = DDPG(lr_actor=0.00025, - lr_critic=0.00025, - input_dims=[10], - gamma=0.99, - tau=0.001, - env=supervisor_env, - batch_size=256, - layer1_size=400, - layer2_size=300, - n_actions=2, - load_models=False, - save_dir='./models/saved/ddpg/') - -score_history = [] - -np.random.seed(0) - -for i in range(1, 500): - done = False - score = 0 - obs = list(map(float, supervisor_env.reset())) - - first_iter = True - if i % 250 == 0: - print("================= TESTING =================") - while not done: - act = agent.choose_action_test(obs).tolist() - new_state, _, done, _ = supervisor_env.step(act) - obs = list(map(float, new_state)) +def create_path(path): + try: + os.makedirs(path) + except OSError: + print ("Creation of the directory %s failed" % path) else: - print("================= TRAINING =================") - while not done: - if (not first_iter): - act = agent.choose_action_train(obs).tolist() - else: - first_iter = False - act = [0, 0] - - new_state, reward, done, info = supervisor_env.step(act) - agent.remember(obs, act, reward, new_state, int(done)) - agent.learn() - score += reward - - obs = list(map(float, new_state)) - - score_history.append(score) - print("===== Episode", i, "score %.2f" % score, - "100 game average %.2f" % np.mean(score_history[-100:])) - - # if i % 100 == 0: - # agent.save_models() + print ("Successfully created the directory %s " % path) + +if __name__ == '__main__': + create_path("./models/saved/ddpg/") + create_path("./exports/") + + supervisor_pre = FindTargetSupervisor('robot', 'target') + supervisor_env = KeyboardPrinter(supervisor_pre) + agent = DDPG(lr_actor=0.00025, + lr_critic=0.00025, + input_dims=[10], + gamma=0.99, + tau=0.001, + env=supervisor_env, + batch_size=256, + layer1_size=400, + layer2_size=300, + layer3_size=200, + n_actions=2, + load_models=False, + save_dir='./models/saved/ddpg/') + # # Load from checkpoint + # agent.load_models(lr_critic=0.00025, lr_actor=0.00025, + # input_dims=[10], + # layer1_size=400, + # layer2_size=300, + # layer3_size=200, + # n_actions=2, + # load_dir='./models/saved/ddpg/') + score_history = [] + + np.random.seed(0) + N_episode = 600 + for i in range(N_episode+1): + done = False + score = 0 + obs = list(map(float, supervisor_env.reset())) + + first_iter = True + + if score_history == [] or np.mean(score_history[-50:])<0.5 or score_history[-1]<0.65: + print("================= TRAINING =================") + while not done: + if (not first_iter): + act = agent.choose_action_train(obs).tolist() + else: + first_iter = False + act = [0, 0] + + new_state, reward, done, info = supervisor_env.step(act) + agent.remember(obs, act, reward, new_state, int(done)) + agent.learn() + score += reward + + obs = list(map(float, new_state)) + else: + print("================= TESTING =================") + while not done: + if (not first_iter): + act = agent.choose_action_test(obs).tolist() + else: + first_iter = False + act = [0, 0] + + new_state, _, done, _ = supervisor_env.step(act) + obs = list(map(float, new_state)) + + + score_history.append(score) + fp = open("./exports/Episode-score.txt","a") + fp.write(str(score)+'\n') + fp.close() + print("===== Episode", i, "score %.2f" % score, + "50 game average %.2f" % np.mean(score_history[-50:])) + + if supervisor_pre.is_solved == True: + agent.save_models() \ No newline at end of file diff --git a/examples/find_and_avoid/doc/demo.gif b/examples/find_and_avoid/doc/demo.gif new file mode 
100644 index 00000000..fd27f5dd Binary files /dev/null and b/examples/find_and_avoid/doc/demo.gif differ diff --git a/examples/find_and_avoid/doc/trend.png b/examples/find_and_avoid/doc/trend.png new file mode 100644 index 00000000..a05e0a80 Binary files /dev/null and b/examples/find_and_avoid/doc/trend.png differ diff --git a/examples/find_and_avoid/worlds/small_world.wbt b/examples/find_and_avoid/worlds/small_world.wbt index 86b38f9d..09df1eae 100644 --- a/examples/find_and_avoid/worlds/small_world.wbt +++ b/examples/find_and_avoid/worlds/small_world.wbt @@ -1,24 +1,21 @@ -#VRML_SIM R2021a utf8 +#VRML_SIM R2021b utf8 WorldInfo { coordinateSystem "NUE" } Viewpoint { - orientation 1 0 0 4.71238898038469 - position -1.0204376820965868e-16 2.343537727000891 3.6188514266717534e-16 + orientation -0.999993584557609 0.0025653260563713327 0.002499989169686478 1.545006413329063 + position -0.0696317107449893 2.632085990439978 0.07609127999188595 } WoodenBox { - translation -0.23 0.15 0.15 - size 0.1 0.3 0.7 -} -DEF target Ball { - translation 0.38 0.06991367201317689 -0.37 - radius 0.07 + translation -0.12 0.08 0.1 + size 0.06 0.15 0.5 } TexturedBackground { } TexturedBackgroundLight { } RectangleArena { + floorSize 0.7 0.7 } DEF supervisor Robot { children [ @@ -32,16 +29,29 @@ DEF supervisor Robot { supervisor TRUE } DEF robot E-puck { - hidden position_0_0 -2.5496523492311853e-11 - hidden position_0_1 2.6980067249608448e-11 - hidden translation_1 -0.026 0.02 0 - hidden rotation_1 -1 0 0 0 - hidden translation_2 0.026 0.02 0 - hidden rotation_2 -1 0 0 0 - translation -0.413245814554288 -6.388972745422807e-05 0.35468382649016655 - rotation -0.0005534598842282864 0.999999486515061 -0.0008489121103638631 -1.9793525593801036 + translation -0.247908 0.000104606 0.267582 name "robot" controller "robot" emitter_channel 0 receiver_channel 0 } +DEF target Solid { + translation 0.26 0.01 -0.25 + children [ + Shape { + appearance Appearance { + material Material { + } + texture ImageTexture { + url [ + "https://avatars.githubusercontent.com/u/57842071?s=200&v=4" + ] + } + } + geometry Plane { + size 0.12 0.12 + } + } + ] + name "target" +}
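
For readers skimming the patch, the sketch below distills its two behavioural changes: the distance-delta reward added in `supervisor.py` (`reward = pre_distance - cur_distance`) and the gas/turning-to-wheel-speed mapping added in `robot.py`. This is a minimal standalone illustration under stated assumptions, not code from the repository: the names `get_distance`, `DistanceDeltaReward`, and `action_to_motor_speeds` are made up for the example, and only the numeric ranges and update rules are taken from the diff above.

```python
import numpy as np

def get_distance(robot_xz, target_xz):
    # Stand-in for utils.get_distance_from_target: Euclidean distance on the floor plane.
    return float(np.hypot(robot_xz[0] - target_xz[0], robot_xz[1] - target_xz[1]))

class DistanceDeltaReward:
    """Sketch of the new shaping in get_reward(): reward = previous distance - current distance."""
    def __init__(self):
        self.pre_distance = None  # reset to None at the start of every episode, as in reset()

    def __call__(self, robot_xz, target_xz):
        cur_distance = get_distance(robot_xz, target_xz)
        if self.pre_distance is None:  # first step of an episode: no baseline yet, reward stays 0
            self.pre_distance = cur_distance
            return 0.0
        reward = self.pre_distance - cur_distance  # positive when the robot moved closer to the target
        self.pre_distance = cur_distance
        return reward

def action_to_motor_speeds(action):
    # Mirrors use_message_data(): action[0] is turning, action[1] is gas, both roughly in [-1, 1].
    wheel = np.clip(float(action[0]) * 2.0, -2.0, 2.0)        # turning rate mapped to [-2, 2]
    gas = np.clip((float(action[1]) + 1.0) * 2.0, 0.0, 4.0)   # gas mapped to [0, 4], always forward
    # Left/right wheel speeds, clipped to the [0, 6] range used in the patch.
    return np.clip([gas + wheel, gas - wheel], 0.0, 6.0)

# Example: moving from 0.50 m to 0.45 m away from the target yields a reward of +0.05.
shaping = DistanceDeltaReward()
shaping((0.5, 0.0), (0.0, 0.0))            # first call only sets the baseline, returns 0.0
print(shaping((0.45, 0.0), (0.0, 0.0)))    # ~0.05
print(action_to_motor_speeds([0.0, 1.0]))  # full gas, no turning -> [4.0, 4.0]
```

Because the shaping term is a per-step distance difference, episode scores stay roughly bounded by the initial robot-to-target distance, which is why the training/testing switch in `supervisor.py` uses thresholds well below 1.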