Algorithmic Expansion to PPO #462

Draft
wants to merge 33 commits into main from drl-ppo
33 commits
e996737
PPO
Sep 11, 2024
5e12c2f
Merge branch 'main' into drl-ppo
Sep 11, 2024
46f843a
DRL PPO Update
Sep 20, 2024
4b889aa
Merge branch 'main' into drl-ppo
Sep 20, 2024
552f549
Buffer changes, update_policy and further advancements in DRL related…
Sep 28, 2024
128727f
first RUNNABLE BUT NOT VALIDATED version of PPO
kim-mskw Oct 1, 2024
991c311
- Removed unused parts of the RolloutBuffer
Oct 19, 2024
aba37f1
Implemented centralized critic
Oct 23, 2024
9047578
Fixed comments regarding centralized critic
Oct 25, 2024
a5e0f5f
Merge branch 'main' into drl-ppo
nick-harder Oct 28, 2024
ca1f9fc
- implemented perform eval differentiation that gets rid of stochasticity
kim-mskw Oct 28, 2024
d102a2d
Merge remote-tracking branch 'origin/main' into drl-ppo
maurerle Oct 28, 2024
7e19389
Merge branch 'drl-ppo' of https://github.com/assume-framework/assume …
kim-mskw Oct 29, 2024
e011baa
- tensor handling in get_actions function
kim-mskw Oct 29, 2024
5d2dd2d
convergence of PPO? a cautious woohoo
kim-mskw Oct 29, 2024
79c287a
- pushed config setting for reproducibility
kim-mskw Oct 29, 2024
97c3bb6
- added further todos for prettier code and better handling of critic…
kim-mskw Oct 29, 2024
9d669e0
Merge branch 'main' into drl-ppo
nick-harder Oct 30, 2024
77f28d5
- moved advantage and value calculation outside of the loop since it …
kim-mskw Oct 30, 2024
a0c2a88
- introduce new actor architecture with distribution layer so that we…
kim-mskw Oct 30, 2024
8542f08
Merge branch 'drl-ppo' of https://github.com/assume-framework/assume …
kim-mskw Oct 30, 2024
cd30915
- deleted epochs from config
kim-mskw Oct 30, 2024
5ad6e15
add mini batch sampling to ppo
adiwied Nov 14, 2024
8928bf7
fix clamping of action distribution
adiwied Nov 15, 2024
949ba73
ppo is now stable in ex2a base, added orthogonal initialization and w…
adiwied Nov 18, 2024
68c16a9
improve hyperparams
adiwied Nov 19, 2024
8b3196a
Merge pull request #487 from adiwied/drl-ppo
kim-mskw Nov 20, 2024
2cf7d0b
Merge branch 'main' of https://github.com/assume-framework/assume int…
kim-mskw Nov 21, 2024
9ce1c36
align all configs with additional algorithm feature
kim-mskw Nov 21, 2024
3f1ac6c
- adjusted all learning configs to match new config format
kim-mskw Nov 25, 2024
61e9bfa
- cleaned weirdly merged configs
kim-mskw Dec 3, 2024
9d43562
Merge branch 'main' of https://github.com/assume-framework/assume int…
kim-mskw Dec 4, 2024
3fa7359
- adjusted exploration noise handling to fit to PPO, makes merge comm…
kim-mskw Dec 4, 2024
26 changes: 18 additions & 8 deletions assume/common/base.py
@@ -68,17 +68,19 @@ def __init__(
for strategy in self.bidding_strategies.values()
):
self.outputs["actions"] = TensorFastSeries(value=0.0, index=self.index)
self.outputs["exploration_noise"] = TensorFastSeries(
value=0.0,
index=self.index,
)
self.outputs["reward"] = FastSeries(value=0.0, index=self.index)
self.outputs["regret"] = FastSeries(value=0.0, index=self.index)

# RL data stored as lists to simplify storing to the buffer
self.outputs["rl_observations"] = []
self.outputs["rl_actions"] = []
self.outputs["rl_rewards"] = []
self.avg_op_time = 0
self.total_op_time = 0


# RL data stored as lists to simplify storing to the buffer
self.outputs["rl_observations"] = []
self.outputs["rl_actions"] = []
self.outputs["rl_rewards"] = []
self.outputs["rl_log_probs"] = []


def calculate_bids(
self,
@@ -742,6 +744,14 @@ def __init__(
# them into suitable format for recurrent neural networks
self.num_timeseries_obs_dim = num_timeseries_obs_dim

self.rl_algorithm_name = kwargs.get("algorithm", "matd3")
if self.rl_algorithm_name == "matd3":
from assume.reinforcement_learning.algorithms.matd3 import get_actions
self.get_actions = get_actions
elif self.rl_algorithm_name == "ppo":
from assume.reinforcement_learning.algorithms.ppo import get_actions
self.get_actions = get_actions


class LearningConfig(TypedDict):
"""
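The hunk above binds a per-algorithm get_actions function to each learning strategy at construction time, defaulting to MATD3. A minimal sketch of the same dispatch written as a lookup table (illustrative only, not the PR's code; the module paths are taken from the imports above):

import importlib

# Map algorithm names to the module that provides a get_actions function.
GET_ACTIONS_MODULES = {
    "matd3": "assume.reinforcement_learning.algorithms.matd3",
    "ppo": "assume.reinforcement_learning.algorithms.ppo",
}

def resolve_get_actions(algorithm: str = "matd3"):
    # Unknown names fall back to matd3, mirroring the default in the hunk above.
    module_path = GET_ACTIONS_MODULES.get(algorithm, GET_ACTIONS_MODULES["matd3"])
    return importlib.import_module(module_path).get_actions
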
1 change: 1 addition & 0 deletions assume/reinforcement_learning/__init__.py
@@ -3,4 +3,5 @@
# SPDX-License-Identifier: AGPL-3.0-or-later

from assume.reinforcement_learning.buffer import ReplayBuffer
from assume.reinforcement_learning.buffer import RolloutBuffer
from assume.reinforcement_learning.learning_role import Learning
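
The newly exported RolloutBuffer is the on-policy counterpart to the existing ReplayBuffer: PPO fills it during a rollout, updates the policy on mini-batches of its contents (commit 5ad6e15), and then discards it. A minimal sketch of the data such a buffer holds, with illustrative field names mirroring the rl_observations, rl_actions, rl_rewards and rl_log_probs lists added in assume/common/base.py (not necessarily the fields of the PR's implementation):

import torch as th

class RolloutBufferSketch:
    """Illustrative on-policy buffer: filled during a rollout, cleared after each policy update."""

    def __init__(self):
        self.observations: list[th.Tensor] = []
        self.actions: list[th.Tensor] = []
        self.log_probs: list[th.Tensor] = []
        self.rewards: list[float] = []

    def add(self, obs, action, log_prob, reward):
        self.observations.append(obs)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)

    def reset(self):
        self.observations.clear()
        self.actions.clear()
        self.log_probs.clear()
        self.rewards.clear()
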
2 changes: 2 additions & 0 deletions assume/reinforcement_learning/algorithms/__init__.py
@@ -7,9 +7,11 @@
from assume.reinforcement_learning.neural_network_architecture import (
MLPActor,
LSTMActor,
DistActor,
)

actor_architecture_aliases: dict[str, type[nn.Module]] = {
"mlp": MLPActor,
"lstm": LSTMActor,
"dist": DistActor,
}
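
DistActor is the new actor architecture with a distribution layer introduced in commit a0c2a88 so that actions can be sampled for PPO. A minimal sketch of such an actor, assuming a Gaussian policy head; the hidden sizes, the clamping of the action distribution (commit 8928bf7) and the orthogonal initialization (commit 949ba73) are simplified or omitted here:

import torch as th
import torch.nn as nn
from torch.distributions import Normal

class DistActorSketch(nn.Module):
    """Illustrative actor whose forward pass returns an action distribution instead of a point action."""

    def __init__(self, obs_dim: int, act_dim: int, hidden_dim: int = 256):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
        )
        self.mu = nn.Linear(hidden_dim, act_dim)
        # state-independent log standard deviation, a common choice for PPO policies
        self.log_std = nn.Parameter(th.zeros(act_dim))

    def forward(self, obs: th.Tensor) -> Normal:
        mean = th.tanh(self.mu(self.body(obs)))  # keep the mean inside the [-1, 1] action bounds
        return Normal(mean, self.log_std.exp())
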
17 changes: 1 addition & 16 deletions assume/reinforcement_learning/algorithms/base_algorithm.py
@@ -34,32 +34,17 @@ def __init__(
# init learning_role as object of Learning class
learning_role,
learning_rate=1e-4,
episodes_collecting_initial_experience=100,
batch_size=1024,
tau=0.005,
gamma=0.99,
gradient_steps=-1,
policy_delay=2,
target_policy_noise=0.2,
target_noise_clip=0.5,
actor_architecture="mlp",
**kwargs, # allow additional params for specific algorithms
):
super().__init__()

self.learning_role = learning_role
self.learning_rate = learning_rate
self.episodes_collecting_initial_experience = (
episodes_collecting_initial_experience
)
self.batch_size = batch_size
self.gamma = gamma
self.tau = tau

self.gradient_steps = gradient_steps

self.policy_delay = policy_delay
self.target_noise_clip = target_noise_clip
self.target_policy_noise = target_policy_noise

if actor_architecture in actor_architecture_aliases.keys():
self.actor_architecture_class = actor_architecture_aliases[
133 changes: 86 additions & 47 deletions assume/reinforcement_learning/algorithms/matd3.py
@@ -11,7 +11,7 @@

from assume.common.base import LearningStrategy
from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm
from assume.reinforcement_learning.learning_utils import polyak_update
from assume.reinforcement_learning.learning_utils import polyak_update, collect_obs_for_central_critic
from assume.reinforcement_learning.neural_network_architecture import CriticTD3

logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ class TD3(RLAlgorithm):

Original paper: https://arxiv.org/pdf/1802.09477.pdf
"""

def __init__(
self,
learning_role,
@@ -46,16 +46,16 @@ def __init__(
super().__init__(
learning_role,
learning_rate,
episodes_collecting_initial_experience,
batch_size,
tau,
gamma,
gradient_steps,
policy_delay,
target_policy_noise,
target_noise_clip,
actor_architecture,
)
self.episodes_collecting_initial_experience = episodes_collecting_initial_experience
self.tau = tau
self.gradient_steps = gradient_steps
self.policy_delay = policy_delay
self.target_policy_noise = target_policy_noise
self.target_noise_clip = target_noise_clip
self.n_updates = 0

def save_params(self, directory):
@@ -201,6 +201,8 @@ def load_actor_params(self, directory: str) -> None:
except Exception:
logger.warning(f"No actor values loaded for agent {u_id}")



def initialize_policy(self, actors_and_critics: dict = None) -> None:
"""
Create actor and critic networks for reinforcement learning.
@@ -293,7 +295,7 @@ def create_critics(self) -> None:
This method initializes critic networks for each agent in the reinforcement learning setup.

Notes:
The observation dimension need to be the same, due to the centralized criic that all actors share.
The observation dimension need to be the same, due to the centralized critic that all actors share.
If you have units with different observation dimensions. They need to have different critics and hence learning roles.
"""
n_agents = len(self.learning_role.rl_strats)
@@ -458,47 +460,14 @@ def update_policy(self):

all_actions = actions.view(self.batch_size, -1)

# this takes the unique observations from all other agents assuming that
# the unique observations are at the end of the observation vector
temp = th.cat(
(
states[:, :i, self.obs_dim - self.unique_obs_dim :].reshape(
self.batch_size, -1
),
states[
:, i + 1 :, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
),
axis=1,
#collect observations for critic
all_states = collect_obs_for_central_critic(
states, i, self.obs_dim, self.unique_obs_dim, self.batch_size
)

# the final all_states vector now contains the current agent's observation
# and the unique observations from all other agents
all_states = th.cat(
(states[:, i, :].reshape(self.batch_size, -1), temp), axis=1
).view(self.batch_size, -1)
# all_states = states[:, i, :].reshape(self.batch_size, -1)

# this is the same as above but for the next states
temp = th.cat(
(
next_states[
:, :i, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
next_states[
:, i + 1 :, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
),
axis=1,
all_next_states = collect_obs_for_central_critic(
next_states, i, self.obs_dim, self.unique_obs_dim, self.batch_size
)

# the final all_next_states vector now contains the current agent's observation
# and the unique observations from all other agents
all_next_states = th.cat(
(next_states[:, i, :].reshape(self.batch_size, -1), temp), axis=1
).view(self.batch_size, -1)
# all_next_states = next_states[:, i, :].reshape(self.batch_size, -1)

with th.no_grad():
# Compute the next Q-values: min over all critics targets
next_q_values = th.cat(
@@ -548,3 +517,73 @@ def update_policy(self):
actor.parameters(), actor_target.parameters(), self.tau
)
i += 1
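
update_policy now delegates the observation stacking for the centralized critic to collect_obs_for_central_critic in learning_utils. Judging from the deleted inline code above, the helper plausibly does something like the following (the signature matches the call sites; the body is reconstructed, not copied from the PR):

import torch as th

def collect_obs_for_central_critic(states, i, obs_dim, unique_obs_dim, batch_size):
    # unique observations of every agent except agent i, assumed to sit at the end of each observation vector
    others = th.cat(
        (
            states[:, :i, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
            states[:, i + 1:, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
        ),
        axis=1,
    )
    # prepend agent i's full observation so the critic sees its own state plus everyone else's unique part
    return th.cat((states[:, i, :].reshape(batch_size, -1), others), axis=1).view(
        batch_size, -1
    )
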


def get_actions(rl_strategy, next_observation):
"""
Gets actions for a unit based on the observation using MATD3.

Args:
rl_strategy (RLStrategy): The strategy containing relevant information.
next_observation (torch.Tensor): The observation.

Returns:
torch.Tensor: The actions containing two bid prices.
tuple: The noise (if applicable).

Note:
If the agent is in learning mode, the actions are chosen by the actor neural net and noise is added to the action.
In the first episodes, the agent is in initial exploration mode, where the action is chosen by noise only to explore
the entire action space; the number of these episodes is defined by episodes_collecting_initial_experience.
If the agent is not in learning mode, the actions are chosen by the actor neural net without noise.
"""

actor = rl_strategy.actor
device = rl_strategy.device
float_type = rl_strategy.float_type
act_dim = rl_strategy.act_dim
learning_mode = rl_strategy.learning_mode
perform_evaluation = rl_strategy.perform_evaluation
action_noise = rl_strategy.action_noise
collect_initial_experience_mode = rl_strategy.collect_initial_experience_mode

# distinction whether we are in learning mode or not to handle exploration realised with noise
if learning_mode and not perform_evaluation:
# if we are in learning mode the first x episodes we want to explore the entire action space
# to get a good initial experience, in the area around the costs of the agent
if collect_initial_experience_mode:
# define current action as solely noise
noise = (
th.normal(mean=0.0, std=0.2, size=(1, act_dim), dtype=float_type)
.to(device)
.squeeze()
)

# =============================================================================
# 2.1 Get Actions and handle exploration
# =============================================================================
base_bid = next_observation[-1]

# add noise to the last dimension of the observation
# needs to be adjusted if observation space is changed, because only makes sense
# if the last dimension of the observation space are the marginal cost
curr_action = noise + base_bid.clone().detach()

else:
# if we are not in the initial exploration phase we choose the action with the actor neural net
# and add noise to the action
curr_action = actor(next_observation).detach() # calls the forward method of the actor network
noise = th.tensor(
action_noise.noise(), device=device, dtype=float_type
)
curr_action += noise
else:
# if we are not in learning mode we just use the actor neural net to get the action without adding noise
curr_action = actor(next_observation).detach()
noise = tuple(0 for _ in range(act_dim))

# Clamp actions to be within the valid action space bounds
curr_action = curr_action.clamp(-1, 1)

return curr_action, noise
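
assume/common/base.py above also imports a get_actions from the ppo module, and the unit outputs gain an rl_log_probs list, but that file is not part of this excerpt. Given the DistActor architecture, a PPO-style get_actions plausibly samples from the policy distribution and returns the log-probability instead of exploration noise; the following is a sketch under that assumption, not the PR's implementation:

import torch as th

def get_actions_ppo_sketch(rl_strategy, next_observation):
    """Illustrative PPO action selection: sample during training, act on the mean during evaluation."""
    # assumes the actor returns a torch.distributions object, as in the DistActor sketch above
    dist = rl_strategy.actor(next_observation)

    if rl_strategy.learning_mode and not rl_strategy.perform_evaluation:
        curr_action = dist.sample()  # stochastic actions provide the exploration in PPO
    else:
        curr_action = dist.mean  # deterministic behaviour for evaluation runs

    log_prob = dist.log_prob(curr_action).sum(dim=-1).detach()

    # clamp to the valid action space, as in the MATD3 version above
    return curr_action.clamp(-1, 1).detach(), log_prob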
