Skip to content

Commit

Permalink
change hparams.py to argparse
Browse files Browse the repository at this point in the history
  • Loading branch information
dnddnjs committed Jul 21, 2018
1 parent 919086f commit da10bd8
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 178 deletions.
6 changes: 0 additions & 6 deletions mujoco/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@
import math


def get_action(mu, std):
    """Sample an action from a Gaussian policy and return it as a NumPy array.

    Args:
        mu: tensor of per-dimension means.
        std: tensor of per-dimension standard deviations (must be >= 0).

    Returns:
        A ``numpy.ndarray`` holding one sample drawn by ``torch.normal``.
    """
    action = torch.normal(mu, std)
    # detach() replaces the deprecated ``.data`` accessor and drops any
    # autograd history; cpu() makes the conversion work for tensors that
    # live on another device, where ``.numpy()`` alone would raise.
    return action.detach().cpu().numpy()


def log_density(x, mu, std, logstd):
var = std.pow(2)
log_density = -(x - mu).pow(2) / (2 * var) \
Expand Down
41 changes: 21 additions & 20 deletions unity/agent/ppo.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import numpy as np
from unity.utils.utils import *
from unity.hparams import HyperParams as hp


def get_gae(rewards, masks, values):
def get_gae(rewards, masks, values, args):
rewards = torch.Tensor(rewards)
masks = torch.Tensor(masks)
returns = torch.zeros_like(rewards)
Expand All @@ -14,10 +13,10 @@ def get_gae(rewards, masks, values):
running_advants = 0

for t in reversed(range(0, len(rewards))):
running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
running_tderror = rewards[t] + hp.gamma * previous_value * masks[t] - \
running_returns = rewards[t] + args.gamma * running_returns * masks[t]
running_tderror = rewards[t] + args.gamma * previous_value * masks[t] - \
values.data[t]
running_advants = running_tderror + hp.gamma * hp.lamda * \
running_advants = running_tderror + args.gamma * args.lamda * \
running_advants * masks[t]

returns[t] = running_returns
Expand All @@ -38,7 +37,7 @@ def surrogate_loss(actor, advants, states, old_policy, actions, index):
return surrogate, ratio


def train_model(actor, critic, memory, actor_optim, critic_optim):
def train_model(actor, critic, memory, actor_optim, critic_optim, args):
memory = np.array(memory)
states = np.vstack(memory[:, 0])
actions = list(memory[:, 1])
Expand All @@ -48,9 +47,10 @@ def train_model(actor, critic, memory, actor_optim, critic_optim):

# ----------------------------
# step 1: get returns and GAEs and log probability of old policy
returns, advants = get_gae(rewards, masks, values)
returns, advants = get_gae(rewards, masks, values, args)
mu, std, logstd = actor(torch.Tensor(states))
old_policy = log_density(torch.Tensor(actions), mu, std, logstd)
old_values = critic(torch.Tensor(states))

criterion = torch.nn.MSELoss()
n = len(states)
Expand All @@ -61,39 +61,40 @@ def train_model(actor, critic, memory, actor_optim, critic_optim):
for epoch in range(10):
np.random.shuffle(arr)

for i in range(n // hp.batch_size):
batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
for i in range(n // args.batch_size):
batch_index = arr[args.batch_size * i: args.batch_size * (i + 1)]
batch_index = torch.LongTensor(batch_index)
inputs = torch.Tensor(states)[batch_index]
returns_samples = returns.unsqueeze(1)[batch_index]
advants_samples = advants.unsqueeze(1)[batch_index]
actions_samples = torch.Tensor(actions)[batch_index]
oldvalue_samples = old_values[batch_index].detach()

loss, ratio = surrogate_loss(actor, advants_samples, inputs,
old_policy.detach(), actions_samples,
batch_index)

values = critic(inputs)
critic_loss = criterion(values, returns_samples)
clipped_values = oldvalue_samples + \
torch.clamp(values - oldvalue_samples,
-args.clip_param,
args.clip_param)
critic_loss1 = criterion(clipped_values, returns_samples)
critic_loss2 = criterion(values, returns_samples)
critic_loss = torch.max(critic_loss1, critic_loss2).mean()

clipped_ratio = torch.clamp(ratio,
1.0 - hp.clip_param,
1.0 + hp.clip_param)
1.0 - args.clip_param,
1.0 + args.clip_param)
clipped_loss = clipped_ratio * advants_samples
actor_loss = -torch.min(loss, clipped_loss).mean()

loss = actor_loss + 0.5 * critic_loss

critic_optim.zero_grad()
loss.backward()
loss.backward(retain_graph=True)
critic_optim.step()

actor_optim.zero_grad()
loss.backward()
actor_optim.step()







101 changes: 0 additions & 101 deletions unity/agent/ppo2.py

This file was deleted.

2 changes: 1 addition & 1 deletion unity/env/unity-environment.log
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
7/18/2018 11:17:24 PM
7/21/2018 2:02:54 PM

10 changes: 0 additions & 10 deletions unity/hparams.py

This file was deleted.

53 changes: 35 additions & 18 deletions unity/main.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,44 @@
import torch
import argparse
import numpy as np
import datetime
import torch.optim as optim
from unity.model import Actor, Critic
from unity.utils.utils import get_action
from collections import deque
from unity.utils.running_state import ZFilter
from unity.hparams import HyperParams as hp
from unity.agent.ppo2 import train_model
from unity.agent.ppo import train_model
from unity.unityagents import UnityEnvironment

def _str2bool(value):
    """Convert a command-line token such as ``true``/``false`` to a bool."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % (value,))


# Command-line hyper-parameters. Every numeric option carries an explicit
# ``type`` so values given on the command line are not left as strings.
parser = argparse.ArgumentParser(description='Setting for unity walker agent')
# Accepts a bare ``--render`` as well as ``--render True`` / ``--render False``.
parser.add_argument('--render', type=_str2bool, nargs='?', const=True,
                    default=False,
                    help='render the environment; omit (or pass false) to '
                         'run with no graphics')
parser.add_argument('--load_model', default=None,
                    help='path prefix of saved weights to load before '
                         'training')
parser.add_argument('--gamma', type=float, default=0.995,
                    help='discount factor')
# ``lambda`` is a Python keyword, so the value is stored as ``args.lamda``
# (the name the PPO code reads); ``--lambda`` is kept as an alias.
parser.add_argument('--lamda', '--lambda', dest='lamda', type=float,
                    default=0.95, help='GAE hyper-parameter')
parser.add_argument('--hidden_size', '--hidden', dest='hidden_size',
                    type=int, default=512,
                    help='hidden unit size of actor and critic networks')
parser.add_argument('--critic_lr', type=float, default=0.0001)
parser.add_argument('--actor_lr', type=float, default=0.0001)
parser.add_argument('--batch_size', type=int, default=1024)
parser.add_argument('--l2_rate', type=float, default=0.001,
                    help='l2 regularizer coefficient')
parser.add_argument('--clip_param', type=float, default=0.1,
                    help='hyper parameter for ppo policy loss and value loss')
parser.add_argument('--activation', default='tanh',
                    choices=('tanh', 'swish'),
                    help='you can choose between tanh and swish')
args = parser.parse_args()
# The networks read ``args.hidden``; expose it next to ``hidden_size`` so
# both spellings resolve to the same value.
args.hidden = args.hidden_size
# Keep the attribute name the original ``--lambda`` option produced.
setattr(args, 'lambda', args.lamda)


if __name__=="__main__":
if __name__ == "__main__":
env_name = "./env/walker"
train_mode = True
torch.manual_seed(500)

from unity.unityagents import UnityEnvironment

env = UnityEnvironment(file_name=env_name)
if args.render:
env = UnityEnvironment(file_name=env_name)
else:
env = UnityEnvironment(file_name=env_name, no_graphics=True)

# setting for unity ml-agent
default_brain = env.brain_names[0]
Expand All @@ -34,17 +50,17 @@
print('state size:', num_inputs)
print('action size:', num_actions)

actor = Actor(num_inputs, num_actions)
critic = Critic(num_inputs)
actor = Actor(num_inputs, num_actions, args)
critic = Critic(num_inputs, args)

if args.load_model is not None:
model_path = args.load_model
actor = torch.load(model_path + '/actor')
critic = torch.load(model_path + '/critic')
actor = actor.load_state_dict(model_path + 'actor.pt')
critic = critic.load_state_dict(model_path + 'critic.pt')

actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
weight_decay=hp.l2_rate)
actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr,
weight_decay=args.l2_rate)

# running average of state
running_state = ZFilter((num_inputs,), clip=5)
Expand Down Expand Up @@ -93,9 +109,10 @@
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episodes, score_avg))
actor.train(), critic.train()
train_model(actor, critic, memory, actor_optim, critic_optim)
train_model(actor, critic, memory, actor_optim, critic_optim, args)
if iter % 10:
torch.save(actor, 'save_model/actor1')
torch.save(critic, 'save_model/critic1')
time = datetime.datetime.now().strftime("%Y%m%d_%H%M")
actor.save_state_dict('save_model/' + time + 'actor.pt')
critic.save_state_dict('save_model/' + time + 'critic.pt')

env.close()
43 changes: 26 additions & 17 deletions unity/model.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,50 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from mujoco.hparams import HyperParams as hp


class Actor(nn.Module):
    """Policy network that outputs the mean of a Gaussian action distribution.

    Three hidden linear layers feed an output layer that produces ``mu``.
    The log standard deviation is fixed at zero, so ``std`` is always one.

    Args:
        num_inputs: size of the input (state) vector.
        num_outputs: size of the output (action) vector.
        args: settings object providing ``activation`` (``'tanh'`` or
            ``'swish'``) and the hidden layer width as ``hidden`` or
            ``hidden_size``.
    """

    def __init__(self, num_inputs, num_outputs, args):
        super(Actor, self).__init__()
        self.args = args
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs

        # The argument parser names this option ``hidden_size`` while older
        # callers pass ``hidden``; accept either. int() guards against the
        # value arriving as a string from the command line.
        hidden = getattr(args, 'hidden', None)
        if hidden is None:
            hidden = args.hidden_size
        hidden = int(hidden)

        self.fc1 = nn.Linear(num_inputs, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, num_outputs)

        # Shrink the output layer's initial weights and zero its bias.
        self.fc4.weight.data.mul_(0.1)
        self.fc4.bias.data.mul_(0.0)

    @staticmethod
    def _swish(x):
        """Return ``x * sigmoid(x)``."""
        return x * torch.sigmoid(x)

    def forward(self, x):
        """Return ``(mu, std, logstd)`` for a batch of states ``x``.

        Raises:
            ValueError: if ``args.activation`` is neither ``'tanh'`` nor
                ``'swish'``.
        """
        activation = self.args.activation
        if activation == 'tanh':
            act = torch.tanh
        elif activation == 'swish':
            act = self._swish
        else:
            raise ValueError(
                "unknown activation %r; expected 'tanh' or 'swish'"
                % (activation,))

        for layer in (self.fc1, self.fc2, self.fc3):
            x = act(layer(x))
        mu = self.fc4(x)

        logstd = torch.zeros_like(mu)
        std = torch.exp(logstd)
        return mu, std, logstd


class Critic(nn.Module):
def __init__(self, num_inputs):
def __init__(self, num_inputs, args):
super(Critic, self).__init__()
self.fc1 = nn.Linear(num_inputs, hp.hidden)
self.fc2 = nn.Linear(hp.hidden, hp.hidden)
self.fc3 = nn.Linear(hp.hidden, 1)
self.fc1 = nn.Linear(num_inputs, args.hidden)
self.fc2 = nn.Linear(args.hidden, args.hidden)
self.fc3 = nn.Linear(args.hidden, 1)
self.fc3.weight.data.mul_(0.1)
self.fc3.bias.data.mul_(0.0)

Expand Down
Binary file modified unity/save_model/actor1
Binary file not shown.
Binary file modified unity/save_model/critic1
Binary file not shown.
Loading

0 comments on commit da10bd8

Please sign in to comment.