Commit

Checked the current two main training loops for release
UnnamedMoose committed May 29, 2023
1 parent 7f72967 commit 074a86f
Showing 30 changed files with 46,870 additions and 71 deletions.
2,880 changes: 2,880 additions & 0 deletions agentData/SAC_sblPretrain_try0_fromPID_3.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_sblPretrain_try0_fromPID_3.zip
Binary file not shown.
6,266 changes: 6,266 additions & 0 deletions agentData/SAC_sblPretrain_try0_fromPID_4.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_sblPretrain_try0_fromPID_4.zip
Binary file not shown.
Binary file not shown.
6,023 changes: 6,023 additions & 0 deletions agentData/SAC_sblPretrain_try0_fromPID_5.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_sblPretrain_try0_fromPID_5.zip
Binary file not shown.
Binary file not shown.
Binary file modified agentData/SAC_sblPretrain_try0_fromPID_convergence.png
6,435 changes: 6,435 additions & 0 deletions agentData/SAC_try9_0.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_try9_0.zip
Binary file not shown.
Binary file added agentData/SAC_try9_0_replayBuffer.pkl
Binary file not shown.
6,198 changes: 6,198 additions & 0 deletions agentData/SAC_try9_1.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_try9_1.zip
Binary file not shown.
Binary file added agentData/SAC_try9_1_replayBuffer.pkl
Binary file not shown.
6,566 changes: 6,566 additions & 0 deletions agentData/SAC_try9_2.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_try9_2.zip
Binary file not shown.
Binary file added agentData/SAC_try9_2_replayBuffer.pkl
Binary file not shown.
6,329 changes: 6,329 additions & 0 deletions agentData/SAC_try9_3.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_try9_3.zip
Binary file not shown.
Binary file added agentData/SAC_try9_3_replayBuffer.pkl
Binary file not shown.
6,084 changes: 6,084 additions & 0 deletions agentData/SAC_try9_4.monitor.csv

Large diffs are not rendered by default.

Binary file added agentData/SAC_try9_4.zip
Binary file not shown.
Binary file added agentData/SAC_try9_4_replayBuffer.pkl
Binary file not shown.
Binary file added agentData/SAC_try9_convergence.png
49 changes: 49 additions & 0 deletions agentData/SAC_try9_hyperparameters.yaml
@@ -0,0 +1,49 @@
agentName: SAC_try9
agent_kwargs:
  action_noise:
    mu:
    - 0.0
    - 0.0
    - 0.0
    sigma:
    - 0.05
    - 0.05
    - 0.05
  batch_size: 256
  buffer_size: 196608
  ent_coef: auto_0.1
  gamma: 0.95
  gradient_steps: 1
  learning_rate: 0.0005
  learning_starts: 256
  target_entropy: auto
  train_freq:
  - 1
  - step
  use_sde_at_warmup: false
  verbose: 1
env_kwargs:
  currentTurbScale: 2.0
  currentVelScale: 1.0
  noiseMagActuation: 0.1
  noiseMagCoeffs: 0.1
nProc: 16
nTrainingSteps: 1500000
policy_kwargs:
  activation_fn: <class 'torch.nn.modules.activation.GELU'>
  net_arch:
    pi:
    - 128
    - 128
    - 128
    qf:
    - 128
    - 128
    - 128
  use_sde: false
trainingTime:
- 3314.020191
- 3125.679191
- 2926.973152
- 2840.737664
- 2825.509448
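
The file above is the hyperparameter record written alongside the trained agents. A minimal sketch of how such a record could be read back into a stable_baselines3 SAC constructor; the environment object, the helper name and the mapping from the stored activation repr back to the torch class are assumptions, not part of this commit:

import yaml
import torch
import stable_baselines3

# The record stores the activation as its repr, e.g. "<class 'torch.nn.modules.activation.GELU'>",
# so map the bare class name back to the torch class by hand (assumed mapping).
ACTIVATIONS = {"GELU": torch.nn.GELU, "ReLU": torch.nn.ReLU, "Tanh": torch.nn.Tanh}

def sac_from_record(path, env):
    # Hypothetical helper: rebuild an agent from a saved hyperparameter record.
    with open(path) as f:
        record = yaml.safe_load(f)

    policy_kwargs = dict(record["policy_kwargs"])
    actName = policy_kwargs["activation_fn"].split(".")[-1].strip("'>")
    policy_kwargs["activation_fn"] = ACTIVATIONS[actName]

    agent_kwargs = dict(record["agent_kwargs"])
    # The flat mu/sigma lists are not enough to rebuild the VectorizedActionNoise here,
    # so drop it; the training script constructs it explicitly.
    agent_kwargs.pop("action_noise", None)
    agent_kwargs["train_freq"] = tuple(agent_kwargs["train_freq"])

    return stable_baselines3.SAC("MlpPolicy", env, policy_kwargs=policy_kwargs, **agent_kwargs)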
26 changes: 13 additions & 13 deletions main_00_SAC_stable_baselines.py
@@ -36,11 +36,8 @@
# An ugly fix for OpenMP conflicts in my installation.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# TODO retrain with an extended state that will include surface pressure estimates
# based on body velocity, or maybe just zeros? These are needed to restart training in CFD.

# For saving trained agents.
agentName = "SAC_try8_restart_noReplayBuffer"
agentName = "SAC_try9"

# Set to None to pick the best agent from the trained set. Specify as string
# to load a particular saved model.
@@ -52,22 +49,22 @@
do_evaluation = False

# --- Training parameters ---

# agentToRestart = None
agentToRestart = "SAC_try8_forRestart_0"
loadReplayBuffer = True # For a "perfect" restart keep this on.
agentToRestart = None
# agentToRestart = "SAC_try8_forRestart_0"

# No. parallel processes.
nProc = 16

# Do everything N times to rule out random successes and failures.
nAgents = 1
nAgents = 5

# Any found agent will be left alone unless this is set to true.
overwrite = True

# nTrainingSteps = 1_500_000
nTrainingSteps = 500_000

nTrainingSteps = 1_500_000
# nTrainingSteps = 500_000
#
agent_kwargs = {
'learning_rate': 5e-4,
'gamma': 0.95,
@@ -80,7 +77,9 @@
"action_noise": VectorizedActionNoise(NormalActionNoise(
np.zeros(3), 0.05*np.ones(3)), nProc),
"use_sde_at_warmup": False,
"target_entropy": -4.,
# "target_entropy": -4.,
"target_entropy": "auto",
"ent_coef": "auto_0.1",
}
policy_kwargs = {
"activation_fn": torch.nn.GELU,
@@ -136,7 +135,8 @@
else:
agent = stable_baselines3.SAC.load("./agentData/{}".format(agentToRestart),
env=env, force_reset=False)
# agent.load_replay_buffer("./agentData/{}_replayBuffer".format(agentToRestart))
if loadReplayBuffer:
agent.load_replay_buffer("./agentData/{}_replayBuffer".format(agentToRestart))

# Train the agent for N steps
conv, trainingTime = resources.trainAgent(agent, nTrainingSteps, saveFile)
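
The hunks above change the restart logic: target_entropy and ent_coef now use the automatic settings (ent_coef "auto_0.1" learns the entropy coefficient, initialised at 0.1), and the replay buffer is reloaded together with the model weights when loadReplayBuffer is set. A hedged sketch of the matching stable_baselines3 save/load pair; the helper name, paths and the env argument stand in for the script's own setup, and the repository actually trains via resources.trainAgent:

import stable_baselines3

# Saving side, run at the end of the previous session:
# agent.save("./agentData/SAC_try8_forRestart_0")
# agent.save_replay_buffer("./agentData/SAC_try8_forRestart_0_replayBuffer")

def restart_training(env, name="SAC_try8_forRestart_0", steps=1_500_000, loadReplayBuffer=True):
    # Reload the weights and, for a "perfect" restart, the off-policy replay buffer.
    agent = stable_baselines3.SAC.load("./agentData/{}".format(name), env=env, force_reset=False)
    if loadReplayBuffer:
        agent.load_replay_buffer("./agentData/{}_replayBuffer".format(name))
    # Keep counting timesteps from where the previous run stopped.
    agent.learn(total_timesteps=steps, reset_num_timesteps=False)
    return agent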
60 changes: 9 additions & 51 deletions main_01_SAC_sbl_customInit.py
@@ -115,7 +115,7 @@
# %% Create test rollouts.

# Create a random seed.
seed = 1
seed = 3
rng = np.random.default_rng(seed)
np.random.seed(seed)

@@ -146,53 +146,13 @@
observations = ep[stateVars].values
actions = ep[actionVars].values[:-1]
rewards = ep["reward"].values[:-1]
# observations = [ep[stateVars].values[0]]
# actions = np.empty((0, env_pretrain.action_space.shape[0]))
# rewards = []
# for i in range(1, len(ep)):
# obs = ep[stateVars].values[i]
# action = ep[actionVars].values[i]
# reward = ep["reward"].values[i]
# observations = np.append(observations, obs[np.newaxis, :], axis=0)
# actions = np.append(actions, action[np.newaxis, :], axis=0)
# rewards = np.append(rewards, reward)

out_dict_stacked = {"rews": rewards, "acts": actions, "obs": observations, "infos": None}
traj = TrajectoryWithRew(**out_dict_stacked, terminal=(rewards[-1] < -200))
assert traj.rews.shape[0] == traj.acts.shape[0] == traj.obs.shape[0] - 1

rollouts.append(traj)

# Initial approach with creating rollouts from the env directly. Storing and
# reading data is faster and allows the same data to be reused for reproducibility.
# # Create own rollout generation to circumvent the issues with the imitation wrappers.
# rollouts = []
# for iEp in range(nPretrainEpisodes):
# obs = env_pretrain.reset()
# observations = [obs]
# actions = np.empty((0, env_pretrain.action_space.shape[0]))
# rewards = []
# for i in range(env_pretrain._max_episode_steps):
# action, _states = pdController.predict(obs, deterministic=True)
# obs, reward, done, info = env_pretrain.step(action)
# observations = np.append(observations, obs[np.newaxis, :], axis=0)
# actions = np.append(actions, action[np.newaxis, :], axis=0)
# rewards = np.append(rewards, reward)

# out_dict_stacked = {"rews": rewards, "acts": actions, "obs": observations, "infos": None}
# traj = TrajectoryWithRew(**out_dict_stacked, terminal=done)
# assert traj.rews.shape[0] == traj.acts.shape[0] == traj.obs.shape[0] - 1

# rollouts.append(traj)

# # Pack the generated rollouts into DataFrames for eventual saving and easier visualisation.
# pretrainEpisodes = []
# for r in rollouts:
# pretrainEpisodes.append(pandas.DataFrame(
# data=np.concatenate([r.obs[1:, :], r.acts, r.rews[:, np.newaxis]], axis=1),
# columns=["s{:d}".format(i) for i in range(r.obs.shape[1])] +
# ["a{:d}".format(i) for i in range(r.acts.shape[1])] + ["r"]))

# Plot the different training episodes.
fig, ax = plt.subplots()
ax.set_xlabel("s0")
@@ -228,7 +188,7 @@

# Evaluate
print("\nRandomly initialised agent")
rewards_init, _ = resources.evaluate_agent(agent, env_eval, num_episodes=100)
_, rewards_init = resources.evaluate_agent(agent, env_eval, num_episodes=100)

# TODO Choose episodes for pretraining at random.
iPretrain = np.random.default_rng().choice(
@@ -258,7 +218,7 @@

# Evaluate
print("\nPretrained agent")
rewards_pre, _ = resources.evaluate_agent(agent, env_eval, num_episodes=100)
_, rewards_pre = resources.evaluate_agent(agent, env_eval, num_episodes=100)

# Save the pretrained agent.
agent.save(saveFile+"_pretrained")
@@ -283,7 +243,7 @@

# Evaluate
print("\nTrained agent")
rewards_trained, _ = resources.evaluate_agent(agent, env_eval, num_episodes=100)
_, rewards_trained = resources.evaluate_agent(agent, env_eval, num_episodes=100)

# Plot convergence of each agent. Redo after each agent to provide
# intermediate updates on how the training is going.
@@ -294,15 +254,13 @@
fig, ax = plt.subplots()
ax.set_xlabel("Reward range")
ax.set_ylabel("Episode count")
bins = np.linspace(0, rewards_trained.max(), 11)
h, x = np.histogram(rewards_init, bins=bins)
x = (x[1:] + x[:-1])/2
bins = np.linspace(0, np.max(rewards_trained), 11)
x = (bins[1:] + bins[:-1])/2
h, _ = np.histogram(rewards_init, bins=bins)
plt.bar(x, h, color="green", alpha=0.5, label="Initialised", width=20)
h, x = np.histogram(rewards_pre, bins=bins)
x = (x[1:] + x[:-1])/2
h, _ = np.histogram(rewards_pre, bins=bins)
plt.bar(x, h, color="blue", alpha=0.5, label="Pretrained", width=20)
h, x = np.histogram(rewards_trained, bins=bins)
x = (x[1:] + x[:-1])/2
h, _ = np.histogram(rewards_trained, bins=bins)
plt.bar(x, h, color="red", alpha=0.5, label="Pretrained+Trained", width=20)
ax.legend()

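
In the file above, the stored PID episodes are packed into the imitation library's TrajectoryWithRew format (one more observation than actions, because the initial state is included) and a random subset, iPretrain, is used to pretrain the agent before the usual SAC training. The pretraining call itself sits outside the shown hunks; the following is only a hedged illustration of how such rollouts are commonly consumed for behavioural cloning with the imitation library, not necessarily what this script does:

import numpy as np
from imitation.algorithms import bc
from imitation.data import rollout

# Flatten the trajectories into individual (obs, act, next_obs, done) transitions;
# the script first picks a random subset of episodes via iPretrain.
transitions = rollout.flatten_trajectories(rollouts)

bc_trainer = bc.BC(
    observation_space=env_eval.observation_space,
    action_space=env_eval.action_space,
    demonstrations=transitions,
    rng=np.random.default_rng(seed),  # required by recent imitation releases
)
bc_trainer.train(n_epochs=10)  # illustrative epoch count, not taken from the repository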
20 changes: 15 additions & 5 deletions script_1_compareTraining.py
@@ -23,12 +23,15 @@
# %% Load
trainings = {
# Baseline agent with nothing special.
"Random init.": "SAC_try8",
# "Random init.": "SAC_try8",

# 2 dummy state vars for CFD integration
"Random init long state": "SAC_try9",

# Restart tests.
"For restart": "SAC_try8_forRestart",
"Restart": "SAC_try8_restart",
"Restart no replay buffer": "SAC_try8_restart_noReplayBuffer",
# "For restart": "SAC_try8_forRestart",
# "Restart": "SAC_try8_restart",
# "Restart no replay buffer": "SAC_try8_restart_noReplayBuffer",

# Own pretraining - see separate branch.
# "Pretrained actor":
@@ -39,7 +42,7 @@
# "SAC_customInit_try1_copyActorCritic_LR_5e-4_targetEntropy_-4_actionNoise_0.05",

# SBL pretraining.
# "Pretrained from PID": "SAC_sblPretrain_try0_fromPID",
"Pretrained from PID": "SAC_sblPretrain_try0_fromPID",
}

colours = plt.cm.nipy_spectral(np.linspace(0., 0.95, len(trainings)))
@@ -67,11 +70,18 @@

lns = []
for i, t in enumerate(trainings):
kBest = None
for k in data:
if t not in k:
continue
elif kBest is None:
kBest = k
ln = ax[0].plot(data[k]["r"], c=colours[i], label=t)
if np.mean(data[k]["r"].values[-50:]) > np.mean(data[kBest]["r"].values[-50:]):
kBest = k
ax[1].plot(data[k]["l"], c=colours[i], label=t)
# print(t, kBest)
# ax[0].plot(data[kBest]["r"], "b-", lw=4, alpha=0.25)
lns += ln
fig.legend(lns, [l.get_label() for l in lns], loc="upper center", ncol=3, framealpha=1)
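
The comparison script plots the per-episode reward ("r") and episode length ("l") read from the *.monitor.csv files added in this commit, and the new loop tracks the best run of each training as the one with the highest mean reward over the last 50 episodes. A small sketch of loading one of those files, assuming the standard stable_baselines3 Monitor format with a single JSON header line:

import pandas

# Monitor files start with one '#'-prefixed JSON header line, followed by a CSV table
# with columns r (episode reward), l (episode length) and t (wall-clock time).
df = pandas.read_csv("agentData/SAC_try9_0.monitor.csv", skiprows=1)
print(df["r"].rolling(50).mean().iloc[-1])  # mean reward over the last 50 episodes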

5 changes: 3 additions & 2 deletions verySimpleAuv.py
@@ -136,7 +136,7 @@ def __init__(self, seed=None, dt=0.02, noiseMagCoeffs=0.0, noiseMagActuation=0.0
shape=(self.lenAction,), dtype=np.float32)

# Observation space.
lenState = 9
lenState = 9 + 2
self.observation_space = gym.spaces.Box(
-1*np.ones(lenState, dtype=np.float32),
np.ones(lenState, dtype=np.float32),
@@ -167,7 +167,8 @@ def dataToState(self, pos, heading, velocities):
min(1., max(-1., (perr[0]-self.perr_o[0])/0.025)),
min(1., max(-1., (perr[1]-self.perr_o[1])/0.025)),
]),
np.clip(velocities/[0.2, 0.2, 30./180.*np.pi], -1., 1.)
np.clip(velocities/[0.2, 0.2, 30./180.*np.pi], -1., 1.),
np.zeros(2), # Placeholder for additional state variables used only in CFD
])

return newState
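
The two extra state entries keep this simple environment's observation size in sync with the CFD environment, which will fill them with surface pressure estimates; here they are zero placeholders so trained agents keep the same network input shape. A minimal illustration of the shape bookkeeping (values and names are illustrative):

import numpy as np
import gym

lenState = 9 + 2  # 9 simple-environment state variables + 2 CFD-only placeholders
observation_space = gym.spaces.Box(
    -1*np.ones(lenState, dtype=np.float32),
    np.ones(lenState, dtype=np.float32),
    shape=(lenState,), dtype=np.float32)

# In the simple environment the last two entries are always zero; in CFD they would
# carry the normalised surface pressure estimates mentioned in the removed TODO.
newState = np.concatenate([np.zeros(9), np.zeros(2)]).astype(np.float32)
assert observation_space.contains(newState)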
