OpenAI-RLLeaning.py
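"""Tabular Q-learning on Gym's Taxi-v3 environment.

Trains an epsilon-greedy agent with an exponentially decaying epsilon,
stops early once the largest Q-update in an episode drops below a
convergence threshold, then replays the greedy policy for inspection.
"""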
import numpy as np
import gym
import random

def main():
    # create Taxi environment (human render mode opens a window)
    env = gym.make('Taxi-v3', render_mode="human")

    # initialize Q-table: one row per state, one column per action
    state_size = env.observation_space.n
    action_size = env.action_space.n
    qtable = np.zeros((state_size, action_size))

    # hyperparameters
    learning_rate = 0.5
    discount_rate = 0.6
    epsilon = 1.0
    decay_rate = 0.005

    # training variables
    num_episodes = 1000
    max_steps = 99  # per episode
    convergence_criteria = 1e-12
    # training loop
    for episode in range(num_episodes):
        # reset the environment; gym >= 0.26 reset() returns (obs, info)
        state = env.reset()[0]
        done = False
        max_diff_for_episode = 0.0

        for s in range(max_steps):
            # exploration-exploitation tradeoff
            if random.uniform(0, 1) < epsilon:
                # explore: sample a random action
                action = env.action_space.sample()
            else:
                # exploit: take the best-known action
                action = np.argmax(qtable[state, :])

            # take action and observe the outcome; gym >= 0.26 step() returns
            # (obs, reward, terminated, truncated, info)
            new_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Q-learning update:
            # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
            diff = learning_rate * (reward + discount_rate * np.max(qtable[new_state, :]) - qtable[state, action])
            qtable[state, action] += diff
            max_diff_for_episode = max(max_diff_for_episode, abs(diff))

            # move to the new state
            state = new_state

            # if done, finish the episode
            if done:
                break

        # decay epsilon exponentially with the episode index
        epsilon = np.exp(-decay_rate * episode)

        # stop early once the largest Q-update of the episode is negligible
        if max_diff_for_episode < convergence_criteria:
            print(max_diff_for_episode)
            break
print(f"Training completed over {episode+1} episodes")
input("Press Enter to watch trained agent...")
# watch trained agent
state = env.reset()[0]
done = False
rewards = 0
for s in range(max_steps):
print(f"TRAINED AGENT")
print("Step {}".format(s+1))
action = np.argmax(qtable[state,:])
new_state, reward, done, info = env.step(action)[:4]
rewards += reward
env.render()
print(f"score: {rewards}")
state = new_state
if done == True:
break
env.close()
if __name__ == "__main__":
main()