diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_load_weights.py b/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_load_weights.py new file mode 100755 index 00000000..3c22ccaa --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_load_weights.py @@ -0,0 +1,232 @@ +import time +import numpy as np +import random +from PIL import Image +UNIT = 50 # pixels + +HEIGHT = 0 # grid height +WIDTH = 0 # grid width + +# Goal position +goal_x = 0 # x position of goal +goal_y = 0 # y position of goal + +#obstacle y locations +poss = [] +obs_list = [] +y_list = [] + +# possible grid sizes +grid_size = [8, 10, 12, 14, 16] + +#np.random.seed(1) + +class Env(): + def __init__(self): + global HEIGHT + global WIDTH + + HEIGHT = random.choice(grid_size) + WIDTH = random.choice(grid_size) + + self.action_space = ['u', 'd', 'l', 'r'] + self.action_size = len(self.action_space) + + self.counter = 0 + self.rewards = [] + self.goal = [] + + # rectangle + self.rectangle = (UNIT/2, UNIT/2) + + # obstacle + global obs_list + global poss + global y_list + + obs_list = random.sample(xrange(1, HEIGHT), 5) + y_list = [i for i in range(HEIGHT)] + + obs_list.sort() + + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + + # #goal + global goal_x + global goal_y + + poss = list(set(y_list) - set(obs_list)) + + goal_x = random.randint(0, WIDTH-1) + goal_y = random.choice(poss) + + self.set_reward([goal_x, goal_y], 1) + + def reset_reward(self): + + self.rewards = [] + self.goal = [] + + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + + self.set_reward([goal_x, goal_y], 1) + + def set_reward(self, state, reward): + state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) + temp = {} + + if reward > 0: + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + self.goal.append(temp['figure']) + + + elif reward < 0: + temp['direction'] = -1 + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + + temp['state'] = state + self.rewards.append(temp) + + # new methods + + def check_if_reward(self, state): + check_list = dict() + check_list['if_goal'] = False + rewards = 0 + + for reward in self.rewards: + if reward['state'] == state: + rewards += reward['reward'] + if reward['reward'] == 1: + check_list['if_goal'] = True + + check_list['rewards'] = rewards + + return check_list + + def coords_to_state(self, coords): + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) + return [x, y] + + def reset(self): + x, y = self.rectangle + + tmp_x = self.rectangle[0] + UNIT / 2 - x + tmp_y = self.rectangle[1] + UNIT / 2 - y + + self.rectangle = (tmp_x, tmp_y) + + # return observation + self.reset_reward() + return self.get_state() + + def step(self, action): + self.counter += 1 + + if self.counter % 2 == 1: + self.rewards = self.move_rewards() + + next_coords = self.move(self.rectangle, action) + check = self.check_if_reward(self.coords_to_state(next_coords)) + done = check['if_goal'] + reward = check['rewards'] + + s_ = self.get_state() + + return s_, reward, done, next_coords, self.rewards + + def get_state(self): + location = 
self.coords_to_state(self.rectangle) + agent_x = location[0] + agent_y = location[1] + + states = list() + + for reward in self.rewards: + reward_location = reward['state'] + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) + + return states + + def move_rewards(self): + new_rewards = [] + for temp in self.rewards: + if temp['reward'] == 1: + new_rewards.append(temp) + continue + + temp['figure'] = self.move_const(temp) + temp['state'] = self.coords_to_state(temp['figure']) + new_rewards.append(temp) + return new_rewards + + def move_const(self, target): + s = target['figure'] + base_action = np.array([0, 0]) + + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 + + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT + + + if((target['figure'][0] != self.rectangle[0] or target['figure'][1] != self.rectangle[1]) + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): + base_action = np.array([0, 0]) + + tmp_x = target['figure'][0] + base_action[0] + tmp_y = target['figure'][1] + base_action[1] + + target['figure'] = (tmp_x, tmp_y) + s_ = target['figure'] + + return s_ + + def move(self, target, action): + s = target + base_action = np.array([0, 0]) + + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (WIDTH - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + tmp_x = target[0] + base_action[0] + tmp_y = target[1] + base_action[1] + target = (tmp_x, tmp_y) + self.rectangle = (tmp_x, tmp_y) + s_ = target + + return s_ diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_r3.py b/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_r3.py new file mode 100755 index 00000000..963b1a43 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_complex_environment/environment_a3c_r3.py @@ -0,0 +1,244 @@ +import time +import numpy as np +import random +from PIL import Image +UNIT = 50 # pixels + +HEIGHT = 0 # grid height +WIDTH = 0 # grid width + +# Goal position +goal_x = 0 # x position of goal +goal_y = 0 # y position of goal + +#obstacle y locations +poss = [] +obs_list = [] +y_list = [] +# possible grid sizes +grid_size = [7, 9, 11, 13, 15] +#np.random.seed(1) + +class Env(object): + def __init__(self): + super(Env, self).__init__() + global HEIGHT + global WIDTH + + HEIGHT = random.choice(grid_size) + WIDTH = random.choice(grid_size) + + self.action_space = ['u', 'd', 'l', 'r'] + self.action_size = len(self.action_space) + + self.counter = 0 + self.rewards = [] + self.goal = [] + + # rectangle + self.rectangle = (UNIT/2, UNIT/2) + + # obstacle + global obs_list + global poss + global y_list + + obs_list = random.sample(xrange(1, HEIGHT), 5) + y_list = [i for i in range(HEIGHT)] + + obs_list.sort() + + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + # #goal + global goal_x + global goal_y + + poss = list(set(y_list) - set(obs_list)) + + goal_x = random.randint(0, WIDTH-1) + goal_y = random.choice(poss) + + self.set_reward([goal_x, goal_y], 1) 
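+    # reset_reward() below is invoked from reset() at the start of each episode: it re-samples a grid size, five obstacle rows and a goal cell, then registers the obstacle and goal reward objects again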
+ + def reset_reward(self): + + self.rewards = [] + self.goal = [] + + HEIGHT = random.choice(grid_size) + WIDTH = random.choice(grid_size) + + obs_list = random.sample(xrange(1, HEIGHT), 5) + obs_list.sort() + + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + + poss = list(set(y_list) - set(obs_list)) + + goal_x = random.randint(0, WIDTH-1) + goal_y = random.choice(poss) + + self.set_reward([goal_x, goal_y], 1) + + + def set_reward(self, state, reward): + state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) + temp = {} + + if reward > 0: + temp['reward'] = reward + + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + self.goal.append(temp['figure']) + + + elif reward < 0: + temp['direction'] = -1 + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + + temp['state'] = state + self.rewards.append(temp) + + # new methods + + def check_if_reward(self, state): + check_list = dict() + check_list['if_goal'] = False + rewards = 0 + + for reward in self.rewards: + if reward['state'] == state: + rewards += reward['reward'] + if reward['reward'] == 1: + check_list['if_goal'] = True + + check_list['rewards'] = rewards + + return check_list + + def coords_to_state(self, coords): + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) + return [x, y] + + def reset(self): + x, y = self.rectangle + + tmp_x = self.rectangle[0] + UNIT / 2 - x + tmp_y = self.rectangle[1] + UNIT / 2 - y + + self.rectangle = (tmp_x, tmp_y) + + # return observation + self.reset_reward() + return self.get_state() + + def step(self, action): + self.counter += 1 + + if self.counter % 2 == 1: + self.rewards = self.move_rewards() + + next_coords = self.move(self.rectangle, action) + check = self.check_if_reward(self.coords_to_state(next_coords)) + done = check['if_goal'] + reward = check['rewards'] + + s_ = self.get_state() + + return s_, reward, done, next_coords, self.rewards + + def get_state(self): + + location = self.coords_to_state(self.rectangle) + agent_x = location[0] + agent_y = location[1] + + states = list() + + for reward in self.rewards: + reward_location = reward['state'] + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) + + return states + + def move_rewards(self): + new_rewards = [] + for temp in self.rewards: + if temp['reward'] == 1: + new_rewards.append(temp) + continue + temp['figure'] = self.move_const(temp) + temp['state'] = self.coords_to_state(temp['figure']) + new_rewards.append(temp) + return new_rewards + + def move_const(self, target): + s = target['figure'] + base_action = np.array([0, 0]) + + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 + + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT + + if((target['figure'][0] != self.rectangle[0] or target['figure'][1] != self.rectangle[1]) + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): + base_action = np.array([0, 0]) + + tmp_x = target['figure'][0] + base_action[0] + tmp_y = target['figure'][1] + base_action[1] + target['figure'] = (tmp_x, tmp_y) + + s_ = target['figure'] + + return s_ + + def 
move(self, target, action): + s = target + base_action = np.array([0, 0]) + + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (WIDTH - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + tmp_x = target[0] + base_action[0] + tmp_y = target[1] + base_action[1] + + target = (tmp_x, tmp_y) + self.rectangle = (tmp_x, tmp_y) + + s_ = target + + return s_ diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_r3.py b/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_r3.py new file mode 100755 index 00000000..f61d77d2 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_r3.py @@ -0,0 +1,267 @@ +import tensorflow as tf +import numpy as np +import threading +import gym +import os +from scipy.misc import imresize +from environment_a3c_r3 import Env + +total_episodes = 0 + +def copy_src_to_dst(from_scope, to_scope): + """Creates a copy variable weights operation + """ + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + for from_var, to_var in zip(from_vars, to_vars): + op_holder.append(to_var.assign(from_var)) + return op_holder + + +def discount_reward(rewards, gamma=0.99): + """Returns discounted rewards + """ + discounted_r = np.zeros_like(rewards, dtype=np.float32) + running_add = 0 + for t in reversed(range(len(rewards))): + if rewards[t] != 0: + running_add = 0 + running_add = running_add * gamma + rewards[t] + discounted_r[t] = running_add + + return discounted_r + + +class A3CNetwork(object): + + def __init__(self, name, input_shape, output_dim, logdir=None): + """Network structure is defined here + """ + with tf.variable_scope(name): + self.states = tf.placeholder(tf.float32, shape=[None, input_shape], name="states") + self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") + self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") + self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") + + action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") + net = self.states + + with tf.variable_scope("layer1") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + with tf.variable_scope("layer2") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + # actor network + actions = tf.layers.dense(net, output_dim, name="final_fc") + self.action_prob = tf.nn.softmax(actions, name="action_prob") + single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) + + entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) + entropy = tf.reduce_sum(entropy, axis=1) + + log_action_prob = tf.log(single_action_prob + 1e-7) + maximize_objective = log_action_prob * self.advantage + entropy * 0.01 + self.actor_loss = - tf.reduce_mean(maximize_objective) + + # value network + self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) + self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, + predictions=self.values) + + self.total_loss = self.actor_loss + self.value_loss * .5 + # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=.99) + self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4) + var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
scope=name) + self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) + self.gradients_placeholders = [] + + for grad, var in self.gradients: + self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var)) + self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders) + + if logdir: + loss_summary = tf.summary.scalar("total_loss", self.total_loss) + value_summary = tf.summary.histogram("values", self.values) + + self.summary_op = tf.summary.merge([loss_summary, value_summary]) + self.summary_writer = tf.summary.FileWriter(logdir) + + +class Agent(threading.Thread): + + def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, saver,logdir=None): + """Agent worker thread + """ + super(Agent, self).__init__() + self.local = A3CNetwork(name, input_shape, output_dim, logdir) + self.global_to_local = copy_src_to_dst("global", name) + self.global_network = global_network + + self.input_shape = input_shape + self.output_dim = output_dim + self.env = env + self.sess = session + self.coord = coord + self.name = name + self.logdir = logdir + self.saver = saver + + + def play_episode(self): + self.sess.run(self.global_to_local) + if total_episodes == 15000 : + self.coord.request_stop() + states = [] + actions = [] + rewards = [] + global total_episodes + s = self.env.reset() + + done = False + total_reward = 0 + time_step = 0 + global_step = 0 + while not done: + + a = self.choose_action(s) + s2, r, done, next_coords, mod_rewards = self.env.step(a) + + total_reward += r + + states.append(s) + actions.append(a) + s = s2 + + time_step += 1 + global_step += 1 + + if time_step >= 40: + if r == 1: + r *= np.power(0.99, (time_step/2)) + elif r == -1: + r *= np.power(1.01, (time_step/2)) + rewards.append(r) + + if time_step >= 50 or done: + self.train(states, actions, rewards) + self.sess.run(self.global_to_local) + states, actions, rewards = [], [], [] + time_step = 0 + + if done or global_step == 1500 : + print("episode no. 
" + str(total_episodes) + " global episode " + str(global_step) +" total score :" + str(total_reward)) + total_episodes +=1 + break + + + def run(self): + + + while not self.coord.should_stop(): + self.play_episode() + + def choose_action(self, states): + states = np.reshape(states, [-1, self.input_shape]) + feed = { + self.local.states: states + } + + action = self.sess.run(self.local.action_prob, feed) + action = np.squeeze(action) + + return np.random.choice(np.arange(self.output_dim) , p=action) + + def train(self, states, actions, rewards): + states = np.array(states) + actions = np.array(actions) + rewards = np.array(rewards) + + feed = { + self.local.states: states + } + + values = self.sess.run(self.local.values, feed) + + rewards = discount_reward(rewards, gamma=0.99) + + advantage = rewards - values + + feed = { + self.local.states: states, + self.local.actions: actions, + self.local.rewards: rewards, + self.local.advantage: advantage + } + + gradients = self.sess.run(self.local.gradients, feed) + + feed = [] + for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): + feed.append((placeholder, grad)) + + feed = dict(feed) + self.sess.run(self.global_network.apply_gradients, feed) + + +def main(): + # try: + tf.reset_default_graph() + sess = tf.InteractiveSession() + coord = tf.train.Coordinator() + + n_threads = 8 + + input_shape = 23 + output_dim = 4 # {0,1, 2, 3} + global_network = A3CNetwork(name="global", + input_shape=input_shape, + output_dim=output_dim) + var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") + saver = tf.train.Saver(var_list=var_list) + + thread_list = [] + env_list = [] + + for id in range(n_threads): + env = Env() + + single_agent = Agent(env=env, + session=sess, + coord=coord, + name="thread_{}".format(id), + global_network=global_network, + input_shape=input_shape, + output_dim=output_dim, + saver=saver) + thread_list.append(single_agent) + env_list.append(env) + + + + init = tf.global_variables_initializer() + sess.run(init) + + + + for t in thread_list: + t.start() + + print("Ctrl + C to close") + + coord.wait_for_stop() + + + if coord.wait_for_stop() : + print 'stopped' + saver = tf.train.Saver() + saver.save(sess, 'models/modelmr1.ckpt') + print 'model saved' + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_test.py b/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_test.py new file mode 100755 index 00000000..a68af00b --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_complex_environment/grid_env_test.py @@ -0,0 +1,274 @@ +import tensorflow as tf +import numpy as np +import threading +import gym +import os +from scipy.misc import imresize +import environment_a3c_load_weights +from environment_a3c_load_weights import Env +from renderenv_load_weights import EnvRender + +total_episodes = 0 + +def copy_src_to_dst(from_scope, to_scope): + """Creates a copy variable weights operation + """ + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + for from_var, to_var in zip(from_vars, to_vars): + op_holder.append(to_var.assign(from_var)) + return op_holder + + +def discount_reward(rewards, gamma=0.99): + """Returns discounted rewards + """ + discounted_r = np.zeros_like(rewards, dtype=np.float32) + running_add = 0 + for t in reversed(range(len(rewards))): + if rewards[t] != 0: + running_add = 0 + 
running_add = running_add * gamma + rewards[t] + discounted_r[t] = running_add + + return discounted_r + + +class A3CNetwork(object): + + def __init__(self, name, input_shape, output_dim, logdir=None): + """Network structure is defined here + """ + with tf.variable_scope(name): + self.states = tf.placeholder(tf.float32, shape=[None, input_shape], name="states") + self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") + self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") + self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") + + action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") + net = self.states + + with tf.variable_scope("layer1") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + with tf.variable_scope("layer2") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + # actor network + actions = tf.layers.dense(net, output_dim, name="final_fc") + self.action_prob = tf.nn.softmax(actions, name="action_prob") + single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) + + entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) + entropy = tf.reduce_sum(entropy, axis=1) + + log_action_prob = tf.log(single_action_prob + 1e-7) + maximize_objective = log_action_prob * self.advantage + entropy * 0.01 + self.actor_loss = - tf.reduce_mean(maximize_objective) + + # value network + self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) + self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, + predictions=self.values) + + self.total_loss = self.actor_loss + self.value_loss * .5 + # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=.99) + self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4) + var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) + self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) + self.gradients_placeholders = [] + + for grad, var in self.gradients: + self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var)) + self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders) + + if logdir: + loss_summary = tf.summary.scalar("total_loss", self.total_loss) + value_summary = tf.summary.histogram("values", self.values) + + self.summary_op = tf.summary.merge([loss_summary, value_summary]) + self.summary_writer = tf.summary.FileWriter(logdir) + + +class Agent(threading.Thread): + + def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, saver,logdir=None): + """Agent worker thread + """ + super(Agent, self).__init__() + self.local = A3CNetwork(name, input_shape, output_dim, logdir) + self.global_to_local = copy_src_to_dst("global", name) + self.global_network = global_network + + self.input_shape = input_shape + self.output_dim = output_dim + self.env = env + self.sess = session + self.coord = coord + self.name = name + self.logdir = logdir + self.saver = saver + + + def play_episode(self, env_render): + self.sess.run(self.global_to_local) + global total_episodes + if total_episodes == 0 : + self.coord.request_stop() + states = [] + actions = [] + rewards = [] + + s = self.env.reset() + env_render.reset(self.env) + + done = False + total_reward = 0 + time_step = 0 + global_step = 0 + while not done: + env_render.render() + a = self.choose_action(s) + s2, r, done, next_coords, mod_rewards = self.env.step(a) + 
env_render.move(next_coords, mod_rewards) + total_reward += r + + states.append(s) + actions.append(a) + + s = s2 + + time_step += 1 + global_step += 1 + + if time_step >= 40: + if r == 1: + r *= np.power(0.99, (time_step/2)) + elif r == -1: + r *= np.power(1.01, (time_step/2)) + + rewards.append(r) + + if time_step >= 80 or done: + states, actions, rewards = [], [], [] + time_step = 0 + print("episode no. " + str(total_episodes) + " global episode " + str(global_step) +" total score :" + str(total_reward)) + total_episodes +=1 + break + + + def run(self): + gx = environment_a3c_load_weights.goal_x + gy = environment_a3c_load_weights.goal_y + Hx = environment_a3c_load_weights.HEIGHT + Hy = environment_a3c_load_weights.WIDTH + ob_list = environment_a3c_load_weights.obs_list + env_render = EnvRender(gx, gy, Hx, Hy, ob_list) + while not self.coord.should_stop(): + self.play_episode(env_render) + + def choose_action(self, states): + """ + """ + states = np.reshape(states, [-1, self.input_shape]) + feed = { + self.local.states: states + } + + action = self.sess.run(self.local.action_prob, feed) + action = np.squeeze(action) + + return np.argmax(action) + + def train(self, states, actions, rewards): + states = np.array(states) + actions = np.array(actions) + rewards = np.array(rewards) + + feed = { + self.local.states: states + } + + values = self.sess.run(self.local.values, feed) + + rewards = discount_reward(rewards, gamma=0.99) + + advantage = rewards - values + + feed = { + self.local.states: states, + self.local.actions: actions, + self.local.rewards: rewards, + self.local.advantage: advantage + } + + gradients = self.sess.run(self.local.gradients, feed) + + feed = [] + for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): + feed.append((placeholder, grad)) + + feed = dict(feed) + self.sess.run(self.global_network.apply_gradients, feed) + + +def main(): + # try: + tf.reset_default_graph() + sess = tf.InteractiveSession() + coord = tf.train.Coordinator() + + save_path = "models/model.ckpt" + n_threads = 1 + + input_shape = 23 + output_dim = 4 # {0,1, 2, 3} + global_network = A3CNetwork(name="global", + input_shape=input_shape, + output_dim=output_dim) + var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") + saver = tf.train.Saver(var_list=var_list) + + thread_list = [] + env_list = [] + + global total_episodes + + for id in range(n_threads): + env = Env() + + single_agent = Agent(env=env, + session=sess, + coord=coord, + name="thread_{}".format(id), + global_network=global_network, + input_shape=input_shape, + output_dim=output_dim, + saver=saver) + thread_list.append(single_agent) + env_list.append(env) + + + saver = tf.train.Saver() + saver.restore(sess, 'models/modelmr1.ckpt') + + + for t in thread_list: + t.start() + + print("Ctrl + C to close") + + coord.wait_for_stop() + + + if coord.wait_for_stop() : + print 'stopped' + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.data-00000-of-00001 b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.data-00000-of-00001 new file mode 100755 index 00000000..a09a8d35 Binary files /dev/null and b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.data-00000-of-00001 differ diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.index b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.index new file mode 100755 index 
00000000..446f759d Binary files /dev/null and b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.index differ diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.meta b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.meta new file mode 100755 index 00000000..97799561 Binary files /dev/null and b/1-grid-world/8-a3c/A3C/A3C_complex_environment/models/modelmr1.ckpt.meta differ diff --git a/1-grid-world/8-a3c/A3C/A3C_complex_environment/renderenv_load_weights.py b/1-grid-world/8-a3c/A3C/A3C_complex_environment/renderenv_load_weights.py new file mode 100755 index 00000000..d05dbdd3 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_complex_environment/renderenv_load_weights.py @@ -0,0 +1,150 @@ +import numpy as np +import Tkinter as tk +from PIL import ImageTk, Image +import time + +PhotoImage = ImageTk.PhotoImage + +UNIT = 50 # pixels + +HEIGHT = 0 +WIDTH = 0 + +# Goal position +goal_x = 0 # x position of goal +goal_y = 0 # y position of goal + +obs_list = [] + +np.random.seed(1) + +class EnvRender(tk.Tk): + def __init__(self, gx, gy, Hx, Hy, ob_list): + tk.Tk.__init__(self) + global HEIGHT + global WIDTH + + HEIGHT = Hx + WIDTH = Hy + + self.title('A3C') + self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT)) + self.shapes = self.load_images() + self.canvas = self._build_canvas() + + self.counter = 0 + self.objects = [] + + # obstacle + global obs_list + obs_list = ob_list + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + # #goal + global goal_x + global goal_y + goal_x = gx + goal_y = gy + + self.set_reward([goal_x, goal_y], 1) + + def _build_canvas(self): + + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # create grids + for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r + canvas.create_line(x0, y0, x1, y1) + + self.objects = [] + self.goal = [] + # add image to canvas + x, y = UNIT/2, UNIT/2 + self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) + + # pack all` + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((30, 30))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((30, 30))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((30, 30))) + + return rectangle, triangle, circle + + def reset_object(self): + for obj in self.objects: + self.canvas.delete(obj['id']) + + self.objects = [] + # obstacle + self.set_reward([0, obs_list[0]], -1) + self.set_reward([WIDTH-1, obs_list[1]], -1) + self.set_reward([1, obs_list[2]], -1) + self.set_reward([WIDTH-2, obs_list[3]], -1) + self.set_reward([2, obs_list[4]], -1) + # #goal + + self.set_reward([goal_x, goal_y], 1) + + def set_reward(self, state, reward): + state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) + + tmp = {} + if reward > 0: + tmp['id'] = self.canvas.create_image((UNIT * x) + UNIT / 2, (UNIT * y) + UNIT / 2, image=self.shapes[2]) + elif reward < 0: + tmp['id'] = self.canvas.create_image((UNIT * x) + UNIT / 2, (UNIT * y) + UNIT / 2, image=self.shapes[1]) + + tmp['reward'] = reward + self.objects.append(tmp) + + def reset(self, oenv): + 
self.update() + time.sleep(0.5) + + self.canvas.delete(self.rectangle) + self.rectangle = self.canvas.create_image(oenv.rectangle[0], oenv.rectangle[1], image=self.shapes[0]) + self.reset_object() + + def move(self, next_coords, mod_rewards): + self.render() + self.counter += 1 + + if self.counter % 2 == 1: + for obj in self.objects: + if obj['reward'] < 0: + self.canvas.delete(obj['id']) + + self.objects = [item for item in self.objects if item['reward'] > 0] + + for item in mod_rewards: + if item['reward'] < 0: + tmp = {} + tmp['id'] = self.canvas.create_image(item['figure'][0], item['figure'][1], image=self.shapes[1]) + tmp['reward'] = item['reward'] + self.objects.append(tmp) + + self.canvas.delete(self.rectangle) + self.rectangle = self.canvas.create_image(next_coords[0], next_coords[1], image=self.shapes[0]) + self.canvas.tag_raise(self.rectangle) + + def render(self): + time.sleep(0.07) + self.update() + diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_load_weights.py b/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_load_weights.py new file mode 100755 index 00000000..e62fd9b0 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_load_weights.py @@ -0,0 +1,211 @@ +import time +import numpy as np +import random +from PIL import Image +UNIT = 50 # pixels + + +HEIGHT = 0 # grid height +WIDTH = 0 # grid width + +# Goal position +goal_x = 0 # x position of goal +goal_y = 0 # y position of goal + +# possible grid sizes +grid_size = [6,8,10] +#np.random.seed(1) + +class Env(): + def __init__(self): + global HEIGHT + global WIDTH + + HEIGHT = random.choice(grid_size) + WIDTH = random.choice(grid_size) + + self.action_space = ['u', 'd', 'l', 'r'] + self.action_size = len(self.action_space) + + self.counter = 0 + self.rewards = [] + self.goal = [] + + # rectangle + self.rectangle = (UNIT/2, UNIT/2) + + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + + # #goal + global goal_x + global goal_y + goal_x = random.randint(0, WIDTH-1) + goal_y = random.randint(4, HEIGHT-1) + + self.set_reward([goal_x, goal_y], 1) + + def reset_reward(self): + + self.rewards = [] + self.goal = [] + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + + self.set_reward([goal_x, goal_y], 1) + + def set_reward(self, state, reward): + state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) + temp = {} + + if reward > 0: + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + self.goal.append(temp['figure']) + + + elif reward < 0: + temp['direction'] = -1 + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + + temp['state'] = state + self.rewards.append(temp) + + # new methods + + def check_if_reward(self, state): + check_list = dict() + check_list['if_goal'] = False + rewards = 0 + + for reward in self.rewards: + if reward['state'] == state: + rewards += reward['reward'] + if reward['reward'] == 1: + check_list['if_goal'] = True + + check_list['rewards'] = rewards + + return check_list + + def coords_to_state(self, coords): + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) + + return [x, y] + + def reset(self): + x, y = self.rectangle + + tmp_x = self.rectangle[0] + UNIT / 2 - x + tmp_y = self.rectangle[1] + UNIT / 2 - y + + self.rectangle = (tmp_x, tmp_y) + # return observation + self.reset_reward() + + return self.get_state() + + def step(self, action): + 
self.counter += 1 + + if self.counter % 2 == 1: + self.rewards = self.move_rewards() + + next_coords = self.move(self.rectangle, action) + check = self.check_if_reward(self.coords_to_state(next_coords)) + done = check['if_goal'] + reward = check['rewards'] + + s_ = self.get_state() + + return s_, reward, done, next_coords, self.rewards + + def get_state(self): + location = self.coords_to_state(self.rectangle) + agent_x = location[0] + agent_y = location[1] + + states = list() + + for reward in self.rewards: + reward_location = reward['state'] + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) + + return states + + def move_rewards(self): + new_rewards = [] + for temp in self.rewards: + if temp['reward'] == 1: + new_rewards.append(temp) + continue + temp['figure'] = self.move_const(temp) + temp['state'] = self.coords_to_state(temp['figure']) + new_rewards.append(temp) + return new_rewards + + def move_const(self, target): + + s = target['figure'] + base_action = np.array([0, 0]) + + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 + + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT + + if((target['figure'][0] != self.rectangle[0] or target['figure'][1] != self.rectangle[1]) + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): + base_action = np.array([0, 0]) + + tmp_x = target['figure'][0] + base_action[0] + tmp_y = target['figure'][1] + base_action[1] + target['figure'] = (tmp_x, tmp_y) + + s_ = target['figure'] + + return s_ + + def move(self, target, action): + s = target + base_action = np.array([0, 0]) + + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (WIDTH - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + tmp_x = target[0] + base_action[0] + tmp_y = target[1] + base_action[1] + target = (tmp_x, tmp_y) + self.rectangle = (tmp_x, tmp_y) + s_ = target + + return s_ + diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_r1.py b/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_r1.py new file mode 100755 index 00000000..a48fe6d6 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/environment_a3c_r1.py @@ -0,0 +1,211 @@ +import time +import numpy as np +import random +from PIL import Image +UNIT = 50 # pixels + + +HEIGHT = 0 # grid height +WIDTH = 0 # grid width + +# Goal position +goal_x = 0 # x position of goal +goal_y = 0 # y position of goal + +# possible grid sizes +grid_size = [5, 7, 9, 11, 13] +#np.random.seed(1) + +class Env(object): + def __init__(self): + super(Env, self).__init__() + global HEIGHT + global WIDTH + HEIGHT = random.choice(grid_size) + WIDTH = HEIGHT + + self.action_space = ['u', 'd', 'l', 'r'] + self.action_size = len(self.action_space) + + self.counter = 0 + self.rewards = [] + self.goal = [] + + # rectangle + self.rectangle = (UNIT/2, UNIT/2) + + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + + # #goal + global goal_x + global goal_y + goal_x = random.randint(0, WIDTH-1) + goal_y = random.randint(4, HEIGHT-1) + + #self.set_reward([goal_x, goal_y], 1) + self.set_reward([goal_x, goal_y], 1) + + def 
reset_reward(self): + + self.rewards = [] + self.goal = [] + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + + HEIGHT = random.choice(grid_size) + WIDTH = HEIGHT + + goal_x = random.randint(0, WIDTH-1) + goal_y = random.randint(4, HEIGHT-1) + self.set_reward([goal_x, goal_y], 1) + + + def set_reward(self, state, reward): + state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) + temp = {} + + if reward > 0: + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + self.goal.append(temp['figure']) + elif reward < 0: + temp['direction'] = -1 + temp['reward'] = reward + temp['figure'] = ((UNIT * x) + UNIT / 2,(UNIT * y) + UNIT / 2) + temp['state'] = state + + self.rewards.append(temp) + + def check_if_reward(self, state): + check_list = dict() + check_list['if_goal'] = False + rewards = 0 + + for reward in self.rewards: + if reward['state'] == state: + rewards += reward['reward'] + if reward['reward'] == 1: + check_list['if_goal'] = True + + check_list['rewards'] = rewards + + return check_list + + def coords_to_state(self, coords): + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) + + return [x, y] + + def reset(self): + x, y = self.rectangle + + tmp_x = self.rectangle[0] + UNIT / 2 - x + tmp_y = self.rectangle[1] + UNIT / 2 - y + + self.rectangle = (tmp_x, tmp_y) + + # return observation + self.reset_reward() + + return self.get_state() + + def step(self, action): + self.counter += 1 + + if self.counter % 2 == 1: + self.rewards = self.move_rewards() + + next_coords = self.move(self.rectangle, action) + check = self.check_if_reward(self.coords_to_state(next_coords)) + done = check['if_goal'] + reward = check['rewards'] + s_ = self.get_state() + + return s_, reward, done, next_coords, self.rewards + + def get_state(self): + location = self.coords_to_state(self.rectangle) + agent_x = location[0] + agent_y = location[1] + + states = list() + for reward in self.rewards: + reward_location = reward['state'] + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) + + return states + + def move_rewards(self): + new_rewards = [] + for temp in self.rewards: + if temp['reward'] == 1: + new_rewards.append(temp) + continue + temp['figure'] = self.move_const(temp) + temp['state'] = self.coords_to_state(temp['figure']) + new_rewards.append(temp) + + return new_rewards + + def move_const(self, target): + s = target['figure'] + base_action = np.array([0, 0]) + + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 + + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT + + if((target['figure'][0] != self.rectangle[0] or target['figure'][1] != self.rectangle[1]) + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): + base_action = np.array([0, 0]) + + tmp_x = target['figure'][0] + base_action[0] + tmp_y = target['figure'][1] + base_action[1] + target['figure'] = (tmp_x, tmp_y) + s_ = target['figure'] + + return s_ + + def move(self, target, action): + s = target + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (WIDTH - 1) * UNIT: + 
base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + tmp_x = target[0] + base_action[0] + tmp_y = target[1] + base_action[1] + target = (tmp_x, tmp_y) + self.rectangle = (tmp_x, tmp_y) + s_ = target + + return s_ + diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_r1.py b/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_r1.py new file mode 100755 index 00000000..220af81a --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_r1.py @@ -0,0 +1,260 @@ +import tensorflow as tf +import numpy as np +import threading +import gym +import os +from scipy.misc import imresize +from environment_a3c_r1 import Env + +total_episodes = 0 + +def copy_src_to_dst(from_scope, to_scope): + """Creates a copy variable weights operation + """ + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + for from_var, to_var in zip(from_vars, to_vars): + op_holder.append(to_var.assign(from_var)) + return op_holder + +def discount_reward(rewards, gamma=0.99): + """Returns discounted rewards + """ + discounted_r = np.zeros_like(rewards, dtype=np.float32) + running_add = 0 + for t in reversed(range(len(rewards))): + if rewards[t] != 0: + running_add = 0 + running_add = running_add * gamma + rewards[t] + discounted_r[t] = running_add + + return discounted_r + + +class A3CNetwork(object): + + def __init__(self, name, input_shape, output_dim, logdir=None): + """Network structure is defined here + """ + with tf.variable_scope(name): + self.states = tf.placeholder(tf.float32, shape=[None, input_shape], name="states") + self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") + self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") + self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") + + action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") + net = self.states + + with tf.variable_scope("layer1") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + with tf.variable_scope("layer2") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + # actor network + actions = tf.layers.dense(net, output_dim, name="final_fc") + self.action_prob = tf.nn.softmax(actions, name="action_prob") + single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) + + entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) + entropy = tf.reduce_sum(entropy, axis=1) + + log_action_prob = tf.log(single_action_prob + 1e-7) + maximize_objective = log_action_prob * self.advantage + entropy * 0.01 + self.actor_loss = - tf.reduce_mean(maximize_objective) + + # value network + self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) + self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, + predictions=self.values) + + self.total_loss = self.actor_loss + self.value_loss * .5 + # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=.99) + self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4) + var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) + self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) + self.gradients_placeholders = [] + + for grad, var in self.gradients: + self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var)) + self.apply_gradients = 
self.optimizer.apply_gradients(self.gradients_placeholders) + + if logdir: + loss_summary = tf.summary.scalar("total_loss", self.total_loss) + value_summary = tf.summary.histogram("values", self.values) + + self.summary_op = tf.summary.merge([loss_summary, value_summary]) + self.summary_writer = tf.summary.FileWriter(logdir) + + +class Agent(threading.Thread): + + def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, saver,logdir=None): + """Agent worker thread + """ + super(Agent, self).__init__() + self.local = A3CNetwork(name, input_shape, output_dim, logdir) + self.global_to_local = copy_src_to_dst("global", name) + self.global_network = global_network + + self.input_shape = input_shape + self.output_dim = output_dim + self.env = env + self.sess = session + self.coord = coord + self.name = name + self.logdir = logdir + self.saver = saver + + + def play_episode(self): + self.sess.run(self.global_to_local) + if total_episodes == 10000 : + self.coord.request_stop() + states = [] + actions = [] + rewards = [] + global total_episodes + s = self.env.reset() + + done = False + total_reward = 0 + time_step = 0 + global_step = 0 + while not done: + + a = self.choose_action(s) + s2, r, done, next_coords, mod_rewards = self.env.step(a) + total_reward += r + + states.append(s) + actions.append(a) + s = s2 + + time_step += 1 + global_step += 1 + + if time_step >= 40: + if r == 1: + r *= np.power(0.99, (time_step/2)) + elif r == -1: + r *= np.power(1.01, (time_step/2)) + rewards.append(r) + + if time_step >= 50 or done: + self.train(states, actions, rewards) + self.sess.run(self.global_to_local) + states, actions, rewards = [], [], [] + time_step = 0 + + if done or global_step == 500 : + print("episode no. " + str(total_episodes) + " global episode " + str(global_step) +" total score :" + str(total_reward)) + total_episodes +=1 + break + + + def run(self): + while not self.coord.should_stop(): + self.play_episode() + + def choose_action(self, states): + states = np.reshape(states, [-1, self.input_shape]) + feed = { + self.local.states: states + } + + action = self.sess.run(self.local.action_prob, feed) + action = np.squeeze(action) + + return np.random.choice(np.arange(self.output_dim) , p=action) + + def train(self, states, actions, rewards): + states = np.array(states) + actions = np.array(actions) + rewards = np.array(rewards) + + feed = { + self.local.states: states + } + + values = self.sess.run(self.local.values, feed) + + rewards = discount_reward(rewards, gamma=0.99) + advantage = rewards - values + + feed = { + self.local.states: states, + self.local.actions: actions, + self.local.rewards: rewards, + self.local.advantage: advantage + } + + gradients = self.sess.run(self.local.gradients, feed) + + feed = [] + for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): + feed.append((placeholder, grad)) + + feed = dict(feed) + self.sess.run(self.global_network.apply_gradients, feed) + + +def main(): + # try: + tf.reset_default_graph() + sess = tf.InteractiveSession() + coord = tf.train.Coordinator() + + n_threads = 8 + + input_shape = 15 + output_dim = 4 # {0,1, 2, 3} + global_network = A3CNetwork(name="global", + input_shape=input_shape, + output_dim=output_dim) + var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") + saver = tf.train.Saver(var_list=var_list) + + thread_list = [] + env_list = [] + + for id in range(n_threads): + env = Env() + + single_agent = Agent(env=env, + session=sess, + coord=coord, + 
name="thread_{}".format(id), + global_network=global_network, + input_shape=input_shape, + output_dim=output_dim, + saver=saver) + thread_list.append(single_agent) + env_list.append(env) + + + + init = tf.global_variables_initializer() + sess.run(init) + + for t in thread_list: + t.start() + + print("Ctrl + C to close") + + coord.wait_for_stop() + + + if coord.wait_for_stop() : + print 'stopped' + saver = tf.train.Saver() + saver.save(sess, 'models/modelr1.ckpt') + print 'model saved' + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_test.py b/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_test.py new file mode 100755 index 00000000..0be96e5a --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/grid_env_test.py @@ -0,0 +1,265 @@ +import tensorflow as tf +import numpy as np +import threading +import gym +import os +from scipy.misc import imresize +import environment_a3c_load_weights +from environment_a3c_load_weights import Env +from renderenv_load_weights import EnvRender + +total_episodes = 0 + +def copy_src_to_dst(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + for from_var, to_var in zip(from_vars, to_vars): + op_holder.append(to_var.assign(from_var)) + return op_holder + + +def discount_reward(rewards, gamma=0.99): + """Returns discounted rewards + """ + discounted_r = np.zeros_like(rewards, dtype=np.float32) + running_add = 0 + for t in reversed(range(len(rewards))): + if rewards[t] != 0: + running_add = 0 + running_add = running_add * gamma + rewards[t] + discounted_r[t] = running_add + + return discounted_r + + +class A3CNetwork(object): + + def __init__(self, name, input_shape, output_dim, logdir=None): + """Network structure is defined here + """ + with tf.variable_scope(name): + self.states = tf.placeholder(tf.float32, shape=[None, input_shape], name="states") + self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") + self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") + self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") + + action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") + net = self.states + + with tf.variable_scope("layer1") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + with tf.variable_scope("layer2") : + net = tf.layers.dense(net,60,name = "dense") + net = tf.nn.relu(net,name = 'relu') + + # actor network + actions = tf.layers.dense(net, output_dim, name="final_fc") + self.action_prob = tf.nn.softmax(actions, name="action_prob") + single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) + + entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) + entropy = tf.reduce_sum(entropy, axis=1) + + log_action_prob = tf.log(single_action_prob + 1e-7) + maximize_objective = log_action_prob * self.advantage + entropy * 0.01 + self.actor_loss = - tf.reduce_mean(maximize_objective) + + # value network + self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) + self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, + predictions=self.values) + + self.total_loss = self.actor_loss + self.value_loss * .5 + # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=.99) + self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4) + var_list = 
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) + self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) + self.gradients_placeholders = [] + + for grad, var in self.gradients: + self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var)) + self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders) + + if logdir: + loss_summary = tf.summary.scalar("total_loss", self.total_loss) + value_summary = tf.summary.histogram("values", self.values) + + self.summary_op = tf.summary.merge([loss_summary, value_summary]) + self.summary_writer = tf.summary.FileWriter(logdir) + + +class Agent(threading.Thread): + + def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, saver,logdir=None): + """Agent worker thread + """ + super(Agent, self).__init__() + self.local = A3CNetwork(name, input_shape, output_dim, logdir) + self.global_to_local = copy_src_to_dst("global", name) + self.global_network = global_network + + self.input_shape = input_shape + self.output_dim = output_dim + self.env = env + self.sess = session + self.coord = coord + self.name = name + self.logdir = logdir + self.saver = saver + + + def play_episode(self, env_render): + self.sess.run(self.global_to_local) + global total_episodes + if total_episodes == 0 : + self.coord.request_stop() + states = [] + actions = [] + rewards = [] + + s = self.env.reset() + + env_render.reset(self.env) + + done = False + total_reward = 0 + time_step = 0 + global_step = 0 + while not done: + env_render.render() + a = self.choose_action(s) + s2, r, done, next_coords, mod_rewards = self.env.step(a) + env_render.move(next_coords, mod_rewards) + total_reward += r + + states.append(s) + actions.append(a) + s = s2 + + time_step += 1 + global_step += 1 + + rewards.append(r) + + if time_step >= 80 or done: + states, actions, rewards = [], [], [] + time_step = 0 + print("episode no. 
" + str(total_episodes) + " global episode " + str(global_step) +" total score :" + str(total_reward)) + total_episodes +=1 + break + + + def run(self): + gx = environment_a3c_load_weights.goal_x + gy = environment_a3c_load_weights.goal_y + Hx = environment_a3c_load_weights.HEIGHT + Hy = environment_a3c_load_weights.WIDTH + + env_render = EnvRender(gx, gy, Hx, Hy) + while not self.coord.should_stop(): + self.play_episode(env_render) + + def choose_action(self, states): + states = np.reshape(states, [-1, self.input_shape]) + feed = { + self.local.states: states + } + + action = self.sess.run(self.local.action_prob, feed) + action = np.squeeze(action) + + # print action + return np.argmax(action) + + def train(self, states, actions, rewards): + states = np.array(states) + actions = np.array(actions) + rewards = np.array(rewards) + + feed = { + self.local.states: states + } + + values = self.sess.run(self.local.values, feed) + + rewards = discount_reward(rewards, gamma=0.99) + + advantage = rewards - values + + feed = { + self.local.states: states, + self.local.actions: actions, + self.local.rewards: rewards, + self.local.advantage: advantage + } + + gradients = self.sess.run(self.local.gradients, feed) + + feed = [] + for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): + feed.append((placeholder, grad)) + + feed = dict(feed) + self.sess.run(self.global_network.apply_gradients, feed) + + +def main(): + # try: + tf.reset_default_graph() + sess = tf.InteractiveSession() + coord = tf.train.Coordinator() + + save_path = "models/model.ckpt" + n_threads = 1 + + input_shape = 15 + output_dim = 4 # {0,1, 2, 3} + global_network = A3CNetwork(name="global", + input_shape=input_shape, + output_dim=output_dim) + var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") + saver = tf.train.Saver(var_list=var_list) + + thread_list = [] + env_list = [] + + global total_episodes + + for id in range(n_threads): + env = Env() + + single_agent = Agent(env=env, + session=sess, + coord=coord, + name="thread_{}".format(id), + global_network=global_network, + input_shape=input_shape, + output_dim=output_dim, + saver=saver) + thread_list.append(single_agent) + env_list.append(env) + + + saver = tf.train.Saver() + saver.restore(sess, 'models/modelr1.ckpt') + + + for t in thread_list: + t.start() + + print("Ctrl + C to close") + + coord.wait_for_stop() + + + if coord.wait_for_stop() : + print 'stopped' + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/checkpoint b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/checkpoint new file mode 100755 index 00000000..4b8afd39 --- /dev/null +++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "modelr1.ckpt" +all_model_checkpoint_paths: "modelr1.ckpt" diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.data-00000-of-00001 b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.data-00000-of-00001 new file mode 100755 index 00000000..a09520d2 Binary files /dev/null and b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.data-00000-of-00001 differ diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.index b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.index new file mode 100755 index 00000000..f067609e Binary files /dev/null and b/1-grid-world/8-a3c/A3C/A3C_tensorflow/models/modelr1.ckpt.index differ diff --git 
diff --git a/1-grid-world/8-a3c/A3C/A3C_tensorflow/renderenv_load_weights.py b/1-grid-world/8-a3c/A3C/A3C_tensorflow/renderenv_load_weights.py
new file mode 100755
index 00000000..656b7991
--- /dev/null
+++ b/1-grid-world/8-a3c/A3C/A3C_tensorflow/renderenv_load_weights.py
@@ -0,0 +1,143 @@
+import numpy as np
+import Tkinter as tk
+from PIL import ImageTk, Image
+import time
+
+PhotoImage = ImageTk.PhotoImage
+
+UNIT = 50  # pixels
+
+HEIGHT = 0
+WIDTH = 0
+
+# Goal position
+goal_x = 0  # x position of goal
+goal_y = 0  # y position of goal
+
+np.random.seed(1)
+
+class EnvRender(tk.Tk):
+    def __init__(self, gx, gy, Hx, Hy):
+        tk.Tk.__init__(self)
+        global HEIGHT
+        global WIDTH
+
+        HEIGHT = Hx
+        WIDTH = Hy
+
+        self.title('A3C')
+        self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT))
+        self.shapes = self.load_images()
+        self.canvas = self._build_canvas()
+
+        self.counter = 0
+        self.objects = []
+
+        # obstacles
+        self.set_reward([0, 1], -1)
+        self.set_reward([1, 2], -1)
+        self.set_reward([2, 3], -1)
+        # goal
+        global goal_x
+        global goal_y
+        goal_x = gx
+        goal_y = gy
+
+        self.set_reward([goal_x, goal_y], 1)
+
+    def _build_canvas(self):
+        canvas = tk.Canvas(self, bg='white',
+                           height=HEIGHT * UNIT,
+                           width=WIDTH * UNIT)
+        # create grid lines every UNIT pixels
+        for c in range(0, WIDTH * UNIT, UNIT):
+            x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
+            canvas.create_line(x0, y0, x1, y1)
+        for r in range(0, HEIGHT * UNIT, UNIT):
+            x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r
+            canvas.create_line(x0, y0, x1, y1)
+
+        self.objects = []
+        self.goal = []
+        # add the agent image to the canvas
+        x, y = UNIT/2, UNIT/2
+        self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
+
+        # pack all
+        canvas.pack()
+
+        return canvas
+
+    def load_images(self):
+        rectangle = PhotoImage(
+            Image.open("../img/rectangle.png").resize((30, 30)))
+        triangle = PhotoImage(
+            Image.open("../img/triangle.png").resize((30, 30)))
+        circle = PhotoImage(
+            Image.open("../img/circle.png").resize((30, 30)))
+
+        return rectangle, triangle, circle
+
+    def reset_object(self):
+        for obj in self.objects:
+            self.canvas.delete(obj['id'])
+
+        self.objects = []
+        # obstacles
+        self.set_reward([0, 1], -1)
+        self.set_reward([1, 2], -1)
+        self.set_reward([2, 3], -1)
+        # goal
+        self.set_reward([goal_x, goal_y], 1)
+
+    def set_reward(self, state, reward):
+        state = [int(state[0]), int(state[1])]
+        x = int(state[0])
+        y = int(state[1])
+
+        tmp = {}
+        if reward > 0:
+            tmp['id'] = self.canvas.create_image((UNIT * x) + UNIT / 2, (UNIT * y) + UNIT / 2, image=self.shapes[2])
+        elif reward < 0:
+            tmp['id'] = self.canvas.create_image((UNIT * x) + UNIT / 2, (UNIT * y) + UNIT / 2, image=self.shapes[1])
+
+        tmp['reward'] = reward
+        self.objects.append(tmp)
+
+    def reset(self, oenv):
+        self.update()
+        time.sleep(0.5)
+
+        self.canvas.delete(self.rectangle)
+        self.rectangle = self.canvas.create_image(oenv.rectangle[0], oenv.rectangle[1], image=self.shapes[0])
+        self.reset_object()
+
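+    # move() mirrors one environment step: Env.step() returns the agent's new
+    # pixel coordinates (next_coords) and the updated obstacle list
+    # (mod_rewards); the canvas is redrawn from those values.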
+    def move(self, next_coords, mod_rewards):
+        self.render()
+        self.counter += 1
+
+        if self.counter % 2 == 1:
+            for obj in self.objects:
+                if obj['reward'] < 0:
+                    self.canvas.delete(obj['id'])
+
+            self.objects = [item for item in self.objects if item['reward'] > 0]
+
+            for item in mod_rewards:
+                if item['reward'] < 0:
+                    tmp = {}
+                    tmp['id'] = self.canvas.create_image(item['figure'][0], item['figure'][1], image=self.shapes[1])
+                    tmp['reward'] = item['reward']
+                    self.objects.append(tmp)
+
+        self.canvas.delete(self.rectangle)
+        self.rectangle = self.canvas.create_image(next_coords[0], next_coords[1], image=self.shapes[0])
+        self.canvas.tag_raise(self.rectangle)
+
+    def render(self):
+        time.sleep(0.07)
+        self.update()
diff --git a/1-grid-world/8-a3c/A3C/README.md b/1-grid-world/8-a3c/A3C/README.md
new file mode 100755
index 00000000..6ab59f57
--- /dev/null
+++ b/1-grid-world/8-a3c/A3C/README.md
@@ -0,0 +1,55 @@
+# TensorFlow implementation of A3C for a 2D grid environment
+
+## 1) Environment description
+
+The agent (shown in red) must navigate to the purple circle (reward) while avoiding the moving obstacles (green triangles).
+The reward location is placed at random in the 2D grid at the start of every episode. The agent generalizes well to unseen reward locations as well as unseen grid sizes at test time.
+
+![1](https://github.com/akileshbadrinaaraayanan/A3C_grid_world/raw/master/img/sample.png)
+
+## 2) Code organization
+### A3C_tensorflow directory
+
+The following files are used at train time:
+
+* grid_env_r1.py - implementation of the A3C algorithm.
+* environment_a3c_r1.py - the 2D grid environment.
+
+The following files are used at test time:
+
+* grid_env_test.py - loads the saved model.
+* environment_a3c_load_weights.py - game logic for the 2D environment.
+* renderenv_load_weights.py - renders the environment so the agent's behaviour can be monitored.
+
+### A3C_complex_environment directory
+
+The environment is more complex in this case, with more obstacles and with obstacles that move in both directions (left-to-right and right-to-left).
+The following files are used at train time:
+
+* grid_env_r3.py - implementation of the A3C algorithm.
+* environment_a3c_r3.py - the 2D grid environment.
+
+The following files are used at test time:
+
+* grid_env_test.py - loads the saved model.
+* environment_a3c_load_weights.py - game logic for the 2D environment.
+* renderenv_load_weights.py - renders the environment so the agent's behaviour can be monitored.
+
+In both cases, a non-uniform reward decay is used to help the Reinforcement Learning (RL) agent converge.
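+
+The decay itself is implemented in the training code (the `discount_reward(rewards, gamma=0.99)` helper called from the worker's `train` method). As a rough, hypothetical sketch only, a plain exponential discount (not necessarily the exact non-uniform schedule used in this repo) can be computed like this:
+
+```
+import numpy as np
+
+def discounted_returns(rewards, gamma=0.99):
+    # Standard exponentially discounted return, computed backwards in time.
+    # The project's actual "non-uniform" decay may weight steps differently.
+    returns = np.zeros_like(rewards, dtype=np.float64)
+    running = 0.0
+    for t in reversed(range(len(rewards))):
+        running = rewards[t] + gamma * running
+        returns[t] = running
+    return returns
+
+print(discounted_returns([0, 0, 1]))  # approximately [0.9801, 0.99, 1.0]
+```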
+
+## 3) How to train?
+```
+CUDA_VISIBLE_DEVICES="" python grid_env_r1.py
+```
+To test with the pre-trained model (stored in the models directory), run the command below; rendering is also enabled at test time.
+```
+CUDA_VISIBLE_DEVICES="" python grid_env_test.py
+```
+## 4) Acknowledgement
+The basic environment code is based on the grid world environment [here](https://github.com/rlcode/reinforcement-learning/tree/master/1-grid-world).
diff --git a/1-grid-world/8-a3c/A3C/img/circle.png b/1-grid-world/8-a3c/A3C/img/circle.png
new file mode 100755
index 00000000..7aeacd38
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/circle.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/down.png b/1-grid-world/8-a3c/A3C/img/down.png
new file mode 100755
index 00000000..cd94e13f
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/down.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/left.png b/1-grid-world/8-a3c/A3C/img/left.png
new file mode 100755
index 00000000..079c57b6
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/left.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/rectangle.png b/1-grid-world/8-a3c/A3C/img/rectangle.png
new file mode 100755
index 00000000..b7cea073
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/rectangle.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/right.png b/1-grid-world/8-a3c/A3C/img/right.png
new file mode 100755
index 00000000..cbe1b1b5
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/right.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/sample.png b/1-grid-world/8-a3c/A3C/img/sample.png
new file mode 100755
index 00000000..57adf343
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/sample.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/triangle.png b/1-grid-world/8-a3c/A3C/img/triangle.png
new file mode 100755
index 00000000..1cd9db0a
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/triangle.png differ
diff --git a/1-grid-world/8-a3c/A3C/img/up.png b/1-grid-world/8-a3c/A3C/img/up.png
new file mode 100755
index 00000000..6e9c2176
Binary files /dev/null and b/1-grid-world/8-a3c/A3C/img/up.png differ