maze.py
import numpy as np
import pic2maze
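# Tabular Q-learning on a grid maze. The maze layout is produced from an image
# by the local pic2maze module, GridWorld wraps it as a gym-style environment
# (step/reset/render returning observation, reward, done, info), and main()
# trains an epsilon-greedy agent, printing the greedy policy as a grid of
# arrows every 5000 episodes.
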
class GridWorld(object):
    """Grid maze environment: the agent starts at cell 0 and must reach self.end.

    States are flat cell indices 0..m*n-1; obstacle cells are impassable.
    """
    def __init__(self, m, n, obs, end):
        self.grid = np.zeros((m, n))
        self.m = m
        self.n = n
        self.stateSpace = [i for i in range(self.m*self.n)]
        self.end = abs(end - self.m*self.n)  # end counts back from the last cell: end=1 -> cell m*n-1
        self.stateSpace.remove(self.end)     # stateSpace excludes the terminal cell
        self.stateSpacePlus = [i for i in range(self.m*self.n)]
        self.actionSpace = {'Up': -self.m, 'Down': self.m,
                            'Left': -1, 'Right': 1}
        self.possibleActions = ['Up', 'Down', 'Left', 'Right']
        self.agentPosition = 0  # doesn't matter as long as it's not the end cell
        self.obstacles = obs
    def isTerminalState(self, state):
        # the terminal cell is the only state in stateSpacePlus but not in stateSpace
        return (state in self.stateSpacePlus and state not in self.stateSpace)
    def getAgentRowAndColumn(self):
        # NOTE: this arithmetic (and the +/- self.m strides above) assumes a square
        # grid (m == n); for m != n the row/column bookkeeping can go out of sync
        x = self.agentPosition // self.m
        y = self.agentPosition % self.n
        return x, y
    def setState(self, state):
        x, y = self.getAgentRowAndColumn()
        self.grid[x][y] = 0
        self.agentPosition = state
        x, y = self.getAgentRowAndColumn()
        self.grid[x][y] = 1
    def badMove(self, newState, oldState):
        if newState not in self.stateSpacePlus:
            return True  # off the top or bottom of the grid
        elif oldState % self.m == 0 and newState % self.m == self.m - 1:
            return True  # moving Left out of column 0 would wrap to the previous row
        elif oldState % self.m == self.m - 1 and newState % self.m == 0:
            return True  # moving Right out of the last column would wrap to the next row
        elif newState in self.obstacles:
            return True
        else:
            return False
    def step(self, action):
        x, y = self.getAgentRowAndColumn()
        resultingState = self.agentPosition + self.actionSpace[action]
        reward = -1 if not self.isTerminalState(resultingState) else 0
        if not self.badMove(resultingState, self.agentPosition):
            self.setState(resultingState)
            return resultingState, reward, self.isTerminalState(self.agentPosition), None
        else:
            # illegal move: the agent stays put but still pays the step cost
            return self.agentPosition, reward, self.isTerminalState(self.agentPosition), None
    def reset(self):
        self.agentPosition = 0
        self.grid = np.zeros((self.m, self.n))
        return self.agentPosition
    def render(self):
        print('-------------------------------')
        for row in self.grid:
            for col in row:
                if col == 0:
                    print('-', end='\t')
                elif col == 1:
                    print('X', end='\t')
                else:
                    print("?", end='\t')
            print('\n')
        print('-------------------------------')
    def actionSpaceSample(self):
        return np.random.choice(self.possibleActions)
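# Minimal usage sketch (hypothetical 3x3 maze with one rock; these arguments are
# illustrative, not taken from the project's own maze):
#
#   env = GridWorld(3, 3, [4], 1)             # end=1 maps to the last cell, index 8
#   obs = env.reset()                         # agent back at cell 0
#   obs, reward, done, _ = env.step('Right')  # moves to cell 1, reward -1, done False
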
def maxAction(Q, state, actions):
    # return the greedy action for this state under the current Q table
    values = np.array([Q[state, a] for a in actions])
    action = np.argmax(values)
    return actions[action]
def renderPolicy(Q, env):
    actionz = []
    policy = []
    qlen = env.m * env.n  # * len(env.possibleActions)
    for index in range(qlen):
        if index == 0 or index == 81:  # skip the start cell; 81 is hard-coded for one maze layout
            continue
        for action in env.possibleActions:
            actionz.append([Q[index, action], action])
        policy.append(max(actionz)[1])  # greedy action for this cell
        actionz = []
    count = 1
    print('-------------------------------')
    print("start", end='\t')
    place = 1
    for choice in policy:
        if count == env.m:
            print('\n')
            count = 0
        count += 1
        if place in env.obstacles:
            print("ROCK", end="\t")
        elif place == env.end:
            print("END", end="\t")
        else:
            if choice == "Up":
                print("↑", end="\t")
            elif choice == "Down":
                print("↓", end="\t")
            elif choice == "Right":
                print("→", end='\t')
            elif choice == "Left":
                print('←', end='\t')
            else:
                print("??", end='\t')
        place += 1
    print('\n')
    print('-------------------------------')
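# For the hypothetical 3x3 maze sketched above, the printed policy would look
# roughly like this (tab-separated; ROCK marks an obstacle, END the goal):
#
#   start   →       ↓
#   ↓       ROCK    ↓
#   →       →       END
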
def main(m, n, obs, end, alpha=.1, discount=1, eps=1):
    # discount=1: infinitely farsighted; eps=1: start fully exploratory
    env = GridWorld(m, n, obs, end)
    ALPHA = alpha
    Q = {}
    for state in env.stateSpacePlus:
        for action in env.possibleActions:
            Q[state, action] = 0
    numGames = 20000
    totalRewards = np.zeros(numGames)
    observation = env.reset()
    for i in range(numGames):
        if i % 5000 == 0:
            print('starting game', i)
            print("Current Policy:")
            renderPolicy(Q, env)
            # env.render()
        done = False
        epRewards = 0
        observation = env.reset()
        while not done:
            rand = np.random.random()
            action = maxAction(Q, observation, env.possibleActions) if rand < (1-eps) \
                else env.actionSpaceSample()
            observation_, reward, done, info = env.step(action)
            epRewards += reward
            action_ = maxAction(Q, observation_, env.possibleActions)
            Q[observation, action] = Q[observation, action] + ALPHA*(reward + \
                discount*Q[observation_, action_] - Q[observation, action])
            observation = observation_
        if eps - 2 / numGames > 0:  # linear decrease in eps towards pure greed about halfway through
            eps -= 2 / numGames
        else:
            eps = 0
        totalRewards[i] = epRewards
    renderPolicy(Q, env)
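# The update inside the while-loop above is standard tabular Q-learning:
#
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# With a reward of -1 per step and gamma = 1, the learned Q-values approximate
# the negative of the remaining path length, so the greedy policy converges
# toward a shortest route around the obstacles.
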
if __name__ == "__main__":
    rowidth = 5
    # pic2maze.main is expected to return a sequence whose first element scales
    # to the column count and whose second element is the obstacle list
    maze = pic2maze.main("IMG_0954.JPG", "obsmaze.png", rowidth)
    n = int(maze[0] / rowidth)
    m = int(rowidth)
    obs = maze[1]
    main(m, n, obs, 1)