diff --git a/mujoco/utils/utils.py b/mujoco/utils/utils.py
index 4a49a78..e857ef3 100644
--- a/mujoco/utils/utils.py
+++ b/mujoco/utils/utils.py
@@ -2,12 +2,6 @@
 import math
 
 
-def get_action(mu, std):
-    action = torch.normal(mu, std)
-    action = action.data.numpy()
-    return action
-
-
 def log_density(x, mu, std, logstd):
     var = std.pow(2)
     log_density = -(x - mu).pow(2) / (2 * var) \
diff --git a/unity/agent/ppo.py b/unity/agent/ppo.py
index d9f224f..5db9f0e 100644
--- a/unity/agent/ppo.py
+++ b/unity/agent/ppo.py
@@ -1,9 +1,8 @@
 import numpy as np
 from unity.utils.utils import *
-from unity.hparams import HyperParams as hp
 
 
-def get_gae(rewards, masks, values):
+def get_gae(rewards, masks, values, args):
     rewards = torch.Tensor(rewards)
     masks = torch.Tensor(masks)
     returns = torch.zeros_like(rewards)
@@ -14,10 +13,10 @@ def get_gae(rewards, masks, values):
     running_advants = 0
 
     for t in reversed(range(0, len(rewards))):
-        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
-        running_tderror = rewards[t] + hp.gamma * previous_value * masks[t] - \
+        running_returns = rewards[t] + args.gamma * running_returns * masks[t]
+        running_tderror = rewards[t] + args.gamma * previous_value * masks[t] - \
                     values.data[t]
-        running_advants = running_tderror + hp.gamma * hp.lamda * \
+        running_advants = running_tderror + args.gamma * args.lamda * \
                     running_advants * masks[t]
 
         returns[t] = running_returns
@@ -38,7 +37,7 @@
     return surrogate, ratio
 
 
-def train_model(actor, critic, memory, actor_optim, critic_optim):
+def train_model(actor, critic, memory, actor_optim, critic_optim, args):
     memory = np.array(memory)
     states = np.vstack(memory[:, 0])
     actions = list(memory[:, 1])
@@ -48,9 +47,10 @@
 
     # ----------------------------
    # step 1: get returns and GAEs and log probability of old policy
-    returns, advants = get_gae(rewards, masks, values)
+    returns, advants = get_gae(rewards, masks, values, args)
     mu, std, logstd = actor(torch.Tensor(states))
     old_policy = log_density(torch.Tensor(actions), mu, std, logstd)
+    old_values = critic(torch.Tensor(states))
 
     criterion = torch.nn.MSELoss()
     n = len(states)
@@ -61,39 +61,40 @@
     for epoch in range(10):
         np.random.shuffle(arr)
 
-        for i in range(n // hp.batch_size):
-            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
+        for i in range(n // args.batch_size):
+            batch_index = arr[args.batch_size * i: args.batch_size * (i + 1)]
             batch_index = torch.LongTensor(batch_index)
             inputs = torch.Tensor(states)[batch_index]
             returns_samples = returns.unsqueeze(1)[batch_index]
             advants_samples = advants.unsqueeze(1)[batch_index]
             actions_samples = torch.Tensor(actions)[batch_index]
+            oldvalue_samples = old_values[batch_index].detach()
+
             loss, ratio = surrogate_loss(actor, advants_samples, inputs,
                                          old_policy.detach(), actions_samples,
                                          batch_index)
 
             values = critic(inputs)
-            critic_loss = criterion(values, returns_samples)
+            clipped_values = oldvalue_samples + \
+                             torch.clamp(values - oldvalue_samples,
+                                         -args.clip_param,
+                                         args.clip_param)
+            critic_loss1 = criterion(clipped_values, returns_samples)
+            critic_loss2 = criterion(values, returns_samples)
+            critic_loss = torch.max(critic_loss1, critic_loss2).mean()
 
             clipped_ratio = torch.clamp(ratio,
-                                        1.0 - hp.clip_param,
-                                        1.0 + hp.clip_param)
+                                        1.0 - args.clip_param,
+                                        1.0 + args.clip_param)
             clipped_loss = clipped_ratio * advants_samples
             actor_loss = -torch.min(loss, clipped_loss).mean()
 
             loss = actor_loss + 0.5 * critic_loss
 
             critic_optim.zero_grad()
-            loss.backward()
+            loss.backward(retain_graph=True)
             critic_optim.step()
 
             actor_optim.zero_grad()
             loss.backward()
             actor_optim.step()
-
-
-
-
-
-
-
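# --- review note (illustrative, not part of the patch) -----------------------
# The clipped value loss introduced in train_model() above can be exercised in
# isolation. This is only a minimal sketch: SimpleNamespace stands in for the
# parsed args, and the tensors are made-up data.
import torch
from types import SimpleNamespace

args = SimpleNamespace(clip_param=0.1)
criterion = torch.nn.MSELoss()

old_values = torch.randn(8, 1)                 # critic output at rollout time
values = old_values + 0.3 * torch.randn(8, 1)  # critic output after some updates
returns = torch.randn(8, 1)

# keep the new value prediction within +/- clip_param of the rollout-time value
clipped_values = old_values + torch.clamp(values - old_values,
                                          -args.clip_param,
                                          args.clip_param)
# pessimistic (max) combination of the clipped and unclipped errors, as above
critic_loss = torch.max(criterion(clipped_values, returns),
                        criterion(values, returns))
# ------------------------------------------------------------------------------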
diff --git a/unity/agent/ppo2.py b/unity/agent/ppo2.py
deleted file mode 100644
index 02a990e..0000000
--- a/unity/agent/ppo2.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import numpy as np
-from unity.utils.utils import *
-from unity.hparams import HyperParams as hp
-
-
-def get_gae(rewards, masks, values):
-    rewards = torch.Tensor(rewards)
-    masks = torch.Tensor(masks)
-    returns = torch.zeros_like(rewards)
-    advants = torch.zeros_like(rewards)
-
-    running_returns = 0
-    previous_value = 0
-    running_advants = 0
-
-    for t in reversed(range(0, len(rewards))):
-        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
-        running_tderror = rewards[t] + hp.gamma * previous_value * masks[t] - \
-                    values.data[t]
-        running_advants = running_tderror + hp.gamma * hp.lamda * \
-                    running_advants * masks[t]
-
-        returns[t] = running_returns
-        previous_value = values.data[t]
-        advants[t] = running_advants
-
-    advants = (advants - advants.mean()) / advants.std()
-    return returns, advants
-
-
-def surrogate_loss(actor, advants, states, old_policy, actions, index):
-    mu, std, logstd = actor(torch.Tensor(states))
-    new_policy = log_density(actions, mu, std, logstd)
-    old_policy = old_policy[index]
-
-    ratio = torch.exp(new_policy - old_policy)
-    surrogate = ratio * advants
-    return surrogate, ratio
-
-
-def train_model(actor, critic, memory, actor_optim, critic_optim):
-    memory = np.array(memory)
-    states = np.vstack(memory[:, 0])
-    actions = list(memory[:, 1])
-    rewards = list(memory[:, 2])
-    masks = list(memory[:, 3])
-    values = critic(torch.Tensor(states))
-
-    # ----------------------------
-    # step 1: get returns and GAEs and log probability of old policy
-    returns, advants = get_gae(rewards, masks, values)
-    mu, std, logstd = actor(torch.Tensor(states))
-    old_policy = log_density(torch.Tensor(actions), mu, std, logstd)
-    old_values = critic(torch.Tensor(states))
-
-    criterion = torch.nn.MSELoss()
-    n = len(states)
-    arr = np.arange(n)
-
-    # ----------------------------
-    # step 2: get value loss and actor loss and update actor & critic
-    for epoch in range(10):
-        np.random.shuffle(arr)
-
-        for i in range(n // hp.batch_size):
-            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
-            batch_index = torch.LongTensor(batch_index)
-            inputs = torch.Tensor(states)[batch_index]
-            returns_samples = returns.unsqueeze(1)[batch_index]
-            advants_samples = advants.unsqueeze(1)[batch_index]
-            actions_samples = torch.Tensor(actions)[batch_index]
-            oldvalue_samples = old_values[batch_index].detach()
-
-            loss, ratio = surrogate_loss(actor, advants_samples, inputs,
-                                         old_policy.detach(), actions_samples,
-                                         batch_index)
-
-            values = critic(inputs)
-            clipped_values = oldvalue_samples + \
-                             torch.clamp(values - oldvalue_samples,
-                                         -hp.clip_param,
-                                         hp.clip_param)
-            critic_loss1 = criterion(clipped_values, returns_samples)
-            critic_loss2 = criterion(values, returns_samples)
-            critic_loss = torch.max(critic_loss1, critic_loss2).mean()
-
-            clipped_ratio = torch.clamp(ratio,
-                                        1.0 - hp.clip_param,
-                                        1.0 + hp.clip_param)
-            clipped_loss = clipped_ratio * advants_samples
-            actor_loss = -torch.min(loss, clipped_loss).mean()
-
-            loss = actor_loss + 0.5 * critic_loss
-
-            critic_optim.zero_grad()
-            loss.backward(retain_graph=True)
-            critic_optim.step()
-
-            actor_optim.zero_grad()
-            loss.backward()
-            actor_optim.step()
diff --git a/unity/env/unity-environment.log b/unity/env/unity-environment.log
index 9818fc5..76124d5 100644
--- a/unity/env/unity-environment.log
+++ b/unity/env/unity-environment.log
@@ -1,2 +1,2 @@
-7/18/2018 11:17:24 PM
+7/21/2018 2:02:54 PM
diff --git a/unity/hparams.py b/unity/hparams.py
deleted file mode 100644
index a67477a..0000000
--- a/unity/hparams.py
+++ /dev/null
@@ -1,10 +0,0 @@
-class HyperParams:
-    gamma = 0.995
-    lamda = 0.95
-    hidden = 512
-    critic_lr = 0.0001
-    actor_lr = 0.0001
-    batch_size = 1024
-    l2_rate = 0.001
-    max_kl = 0.01
-    clip_param = 0.1
diff --git a/unity/main.py b/unity/main.py
index d4f31ea..2432622 100644
--- a/unity/main.py
+++ b/unity/main.py
@@ -1,28 +1,44 @@
 import torch
 import argparse
 import numpy as np
+import datetime
 import torch.optim as optim
 from unity.model import Actor, Critic
 from unity.utils.utils import get_action
 from collections import deque
 from unity.utils.running_state import ZFilter
-from unity.hparams import HyperParams as hp
-from unity.agent.ppo2 import train_model
+from unity.agent.ppo import train_model
+from unity.unityagents import UnityEnvironment
 
-parser = argparse.ArgumentParser()
-parser.add_argument('--render', default=False)
+parser = argparse.ArgumentParser(description='Settings for the Unity walker agent')
+parser.add_argument('--render', action='store_true',
+                    help='render the environment window during training')
 parser.add_argument('--load_model', default=None)
+parser.add_argument('--gamma', type=float, default=0.995, help='discount factor')
+parser.add_argument('--lamda', type=float, default=0.95, help='GAE hyper-parameter')
+parser.add_argument('--hidden', type=int, default=512,
+                    help='hidden unit size of actor and critic networks')
+parser.add_argument('--critic_lr', type=float, default=0.0001)
+parser.add_argument('--actor_lr', type=float, default=0.0001)
+parser.add_argument('--batch_size', type=int, default=1024)
+parser.add_argument('--l2_rate', type=float, default=0.001,
+                    help='l2 regularizer coefficient')
+parser.add_argument('--clip_param', type=float, default=0.1,
+                    help='clipping parameter for the PPO policy and value losses')
+parser.add_argument('--activation', default='tanh',
+                    help='you can choose between tanh and swish')
 args = parser.parse_args()
 
-if __name__=="__main__":
+if __name__ == "__main__":
     env_name = "./env/walker"
     train_mode = True
     torch.manual_seed(500)
 
-    from unity.unityagents import UnityEnvironment
-
-    env = UnityEnvironment(file_name=env_name)
+    if args.render:
+        env = UnityEnvironment(file_name=env_name)
+    else:
+        env = UnityEnvironment(file_name=env_name, no_graphics=True)
 
     # setting for unity ml-agent
     default_brain = env.brain_names[0]
@@ -34,17 +50,17 @@
     print('state size:', num_inputs)
     print('action size:', num_actions)
 
-    actor = Actor(num_inputs, num_actions)
-    critic = Critic(num_inputs)
+    actor = Actor(num_inputs, num_actions, args)
+    critic = Critic(num_inputs, args)
 
     if args.load_model is not None:
         model_path = args.load_model
-        actor = torch.load(model_path + '/actor')
-        critic = torch.load(model_path + '/critic')
+        actor.load_state_dict(torch.load(model_path + 'actor.pt'))
+        critic.load_state_dict(torch.load(model_path + 'critic.pt'))
 
-    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
-    critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
-                              weight_decay=hp.l2_rate)
+    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
+    critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr,
+                              weight_decay=args.l2_rate)
 
     # running average of state
     running_state = ZFilter((num_inputs,), clip=5)
@@ -93,9 +109,10 @@
         score_avg = np.mean(scores)
         print('{} episode score is {:.2f}'.format(episodes, score_avg))
         actor.train(), critic.train()
-        train_model(actor, critic, memory, actor_optim, critic_optim)
+        train_model(actor, critic, memory, actor_optim, critic_optim, args)
 
         if iter % 10:
-            torch.save(actor, 'save_model/actor1')
-            torch.save(critic, 'save_model/critic1')
+            time = datetime.datetime.now().strftime("%Y%m%d_%H%M")
+            torch.save(actor.state_dict(), 'save_model/' + time + 'actor.pt')
+            torch.save(critic.state_dict(), 'save_model/' + time + 'critic.pt')
     env.close()
\ No newline at end of file
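# --- review note (illustrative, not part of the patch) -----------------------
# Example invocation with the new argparse flags; the values shown are simply
# the defaults that previously lived in the removed HyperParams class, and the
# activation choice is arbitrary.
#
#   python unity/main.py --gamma 0.995 --lamda 0.95 --hidden 512 \
#       --actor_lr 0.0001 --critic_lr 0.0001 --batch_size 1024 \
#       --l2_rate 0.001 --clip_param 0.1 --activation swish
# ------------------------------------------------------------------------------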
diff --git a/unity/model.py b/unity/model.py
index 86ebe17..50170c1 100644
--- a/unity/model.py
+++ b/unity/model.py
@@ -1,41 +1,50 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from mujoco.hparams import HyperParams as hp
 
 
 class Actor(nn.Module):
-    def __init__(self, num_inputs, num_outputs):
+    def __init__(self, num_inputs, num_outputs, args):
+        self.args = args
         self.num_inputs = num_inputs
         self.num_outputs = num_outputs
         super(Actor, self).__init__()
-        self.fc1 = nn.Linear(num_inputs, hp.hidden)
-        self.fc2 = nn.Linear(hp.hidden, hp.hidden)
-        self.fc3 = nn.Linear(hp.hidden, hp.hidden)
-        self.fc4 = nn.Linear(hp.hidden, num_outputs)
+        self.fc1 = nn.Linear(num_inputs, args.hidden)
+        self.fc2 = nn.Linear(args.hidden, args.hidden)
+        self.fc3 = nn.Linear(args.hidden, args.hidden)
+        self.fc4 = nn.Linear(args.hidden, num_outputs)
 
         self.fc4.weight.data.mul_(0.1)
         self.fc4.bias.data.mul_(0.0)
 
     def forward(self, x):
-        x = self.fc1(x)
-        x = x * F.sigmoid(x)
-        x = self.fc2(x)
-        x = x * F.sigmoid(x)
-        x = self.fc3(x)
-        x = x * F.sigmoid(x)
-        mu = self.fc4(x)
+        if self.args.activation == 'tanh':
+            x = F.tanh(self.fc1(x))
+            x = F.tanh(self.fc2(x))
+            x = F.tanh(self.fc3(x))
+            mu = self.fc4(x)
+        elif self.args.activation == 'swish':
+            x = self.fc1(x)
+            x = x * F.sigmoid(x)
+            x = self.fc2(x)
+            x = x * F.sigmoid(x)
+            x = self.fc3(x)
+            x = x * F.sigmoid(x)
+            mu = self.fc4(x)
+        else:
+            raise ValueError
+
         logstd = torch.zeros_like(mu)
         std = torch.exp(logstd)
         return mu, std, logstd
 
 
 class Critic(nn.Module):
-    def __init__(self, num_inputs):
+    def __init__(self, num_inputs, args):
         super(Critic, self).__init__()
-        self.fc1 = nn.Linear(num_inputs, hp.hidden)
-        self.fc2 = nn.Linear(hp.hidden, hp.hidden)
-        self.fc3 = nn.Linear(hp.hidden, 1)
+        self.fc1 = nn.Linear(num_inputs, args.hidden)
+        self.fc2 = nn.Linear(args.hidden, args.hidden)
+        self.fc3 = nn.Linear(args.hidden, 1)
 
         self.fc3.weight.data.mul_(0.1)
         self.fc3.bias.data.mul_(0.0)
diff --git a/unity/save_model/actor1 b/unity/save_model/actor1
index 45d7f08..3b0c0e1 100644
Binary files a/unity/save_model/actor1 and b/unity/save_model/actor1 differ
diff --git a/unity/save_model/critic1 b/unity/save_model/critic1
index 7437ec9..0aa4f9a 100644
Binary files a/unity/save_model/critic1 and b/unity/save_model/critic1 differ
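# --- review note (illustrative, not part of the patch) -----------------------
# The 'swish' branch added to Actor.forward() computes x * sigmoid(x); a
# standalone sketch of the two activation choices on a dummy layer:
import torch

def swish(x):
    # self-gated activation: x * sigmoid(x)
    return x * torch.sigmoid(x)

fc = torch.nn.Linear(10, 512)
h = torch.randn(4, 10)
out_swish = swish(fc(h))      # 'swish' branch
out_tanh = torch.tanh(fc(h))  # 'tanh' branch
# ------------------------------------------------------------------------------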
diff --git a/unity/test_agent.py b/unity/test_agent.py
index e6ec894..1c0d214 100644
--- a/unity/test_agent.py
+++ b/unity/test_agent.py
@@ -2,17 +2,16 @@
 import argparse
 import numpy as np
 from unity.model import Actor, Critic
+from unity.unityagents import UnityEnvironment
 from unity.utils.utils import get_action
 from unity.utils.running_state import ZFilter
 
 if __name__=="__main__":
-    env_name = "./Env/walker2"
+    env_name = "./env/walker"
     train_mode = False
     torch.manual_seed(500)
 
-    from unity.unityagents import UnityEnvironment
-
     env = UnityEnvironment(file_name=env_name)
 
     default_brain = env.brain_names[0]
@@ -24,8 +23,8 @@
     print('state size:', num_inputs)
     print('action size:', num_actions)
 
-    actor = torch.load('save_model/actor')
-    critic = torch.load('save_model/critic')
+    actor = torch.load('save_model/actor1')
+    critic = torch.load('save_model/critic1')
 
     running_state = ZFilter((num_inputs,), clip=5)
     episodes = 0
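# --- review note (illustrative, not part of the patch) -----------------------
# main.py now writes timestamped state_dicts ('save_model/<time>actor.pt') while
# test_agent.py still loads the old whole-model files ('save_model/actor1').
# Loading one of the new checkpoints would look roughly like this; the file name
# and the network sizes below are placeholders.
import torch
from types import SimpleNamespace
from unity.model import Actor

args = SimpleNamespace(hidden=512, activation='tanh')
actor = Actor(num_inputs=212, num_outputs=39, args=args)  # placeholder sizes
actor.load_state_dict(torch.load('save_model/20180721_1402actor.pt'))
actor.eval()
# ------------------------------------------------------------------------------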