Algorithmic Expansion to PPO #462

Draft
wants to merge 33 commits into main from drl-ppo
33 commits
e996737
PPO
Sep 11, 2024
5e12c2f
Merge branch 'main' into drl-ppo
Sep 11, 2024
46f843a
DRL PPO Update
Sep 20, 2024
4b889aa
Merge branch 'main' into drl-ppo
Sep 20, 2024
552f549
Buffer changes, update_policy and further advancements in DRL related…
Sep 28, 2024
128727f
first RUNNABLE BUT NOT VALIDATED version of PPO
kim-mskw Oct 1, 2024
991c311
- Removed unused parts of the RolloutBuffer
Oct 19, 2024
aba37f1
Implemented centralized critic
Oct 23, 2024
9047578
Fixed comments regarding centralized critic
Oct 25, 2024
a5e0f5f
Merge branch 'main' into drl-ppo
nick-harder Oct 28, 2024
ca1f9fc
- implemented perform eval differentiation that gets rid of stochasticity
kim-mskw Oct 28, 2024
d102a2d
Merge remote-tracking branch 'origin/main' into drl-ppo
maurerle Oct 28, 2024
7e19389
Merge branch 'drl-ppo' of https://github.com/assume-framework/assume …
kim-mskw Oct 29, 2024
e011baa
- tensor handling in get_actions function
kim-mskw Oct 29, 2024
5d2dd2d
convergence of PPO? a cautious woohoo
kim-mskw Oct 29, 2024
79c287a
- pushed config setting for reproducibility
kim-mskw Oct 29, 2024
97c3bb6
- added further todos for prettier code and better handling of critic…
kim-mskw Oct 29, 2024
9d669e0
Merge branch 'main' into drl-ppo
nick-harder Oct 30, 2024
77f28d5
- moved advantage and value calculation outside of the loop since it …
kim-mskw Oct 30, 2024
a0c2a88
- introduce new actor architecture with distribution layer so that we…
kim-mskw Oct 30, 2024
8542f08
Merge branch 'drl-ppo' of https://github.com/assume-framework/assume …
kim-mskw Oct 30, 2024
cd30915
- deleted epochs from config
kim-mskw Oct 30, 2024
5ad6e15
add mini batch sampling to ppo
adiwied Nov 14, 2024
8928bf7
fix clamping of action distribution
adiwied Nov 15, 2024
949ba73
ppo is now stable in ex2a base, added orthogonal initialization and w…
adiwied Nov 18, 2024
68c16a9
improve hyperparams
adiwied Nov 19, 2024
8b3196a
Merge pull request #487 from adiwied/drl-ppo
kim-mskw Nov 20, 2024
2cf7d0b
Merge branch 'main' of https://github.com/assume-framework/assume int…
kim-mskw Nov 21, 2024
9ce1c36
align all configs with additional algorithm feature
kim-mskw Nov 21, 2024
3f1ac6c
- adjusted all learning configs to match new config format
kim-mskw Nov 25, 2024
61e9bfa
- cleaned weirdly merged configs
kim-mskw Dec 3, 2024
9d43562
Merge branch 'main' of https://github.com/assume-framework/assume int…
kim-mskw Dec 4, 2024
3fa7359
- adjusted exploration noise handling to fit to PPO, makes merge comm…
kim-mskw Dec 4, 2024
26 changes: 18 additions & 8 deletions assume/common/base.py
@@ -68,17 +68,19 @@ def __init__(
for strategy in self.bidding_strategies.values()
):
self.outputs["actions"] = TensorFastSeries(value=0.0, index=self.index)
self.outputs["exploration_noise"] = TensorFastSeries(
value=0.0,
index=self.index,
)
self.outputs["reward"] = FastSeries(value=0.0, index=self.index)
self.outputs["regret"] = FastSeries(value=0.0, index=self.index)

# RL data stored as lists to simplify storing to the buffer
self.outputs["rl_observations"] = []
self.outputs["rl_actions"] = []
self.outputs["rl_rewards"] = []
self.avg_op_time = 0
self.total_op_time = 0


# RL data stored as lists to simplify storing to the buffer
self.outputs["rl_observations"] = []
self.outputs["rl_actions"] = []
self.outputs["rl_rewards"] = []
self.outputs["rl_log_probs"] = []


def calculate_bids(
self,
@@ -742,6 +744,14 @@ def __init__(
# them into suitable format for recurrent neural networks
self.num_timeseries_obs_dim = num_timeseries_obs_dim

self.rl_algorithm_name = kwargs.get("algorithm", "matd3")
if self.rl_algorithm_name == "matd3":
from assume.reinforcement_learning.algorithms.matd3 import get_actions
self.get_actions = get_actions
elif self.rl_algorithm_name == "ppo":
from assume.reinforcement_learning.algorithms.ppo import get_actions
self.get_actions = get_actions


class LearningConfig(TypedDict):
"""
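The hunk above binds a per-algorithm get_actions function to each learning strategy at construction time, defaulting to MATD3. A minimal sketch of the same dispatch written as a lookup table (illustrative only, not the PR's code; the module paths are taken from the imports above):

import importlib

# Map algorithm names to the module that provides a get_actions function.
GET_ACTIONS_MODULES = {
    "matd3": "assume.reinforcement_learning.algorithms.matd3",
    "ppo": "assume.reinforcement_learning.algorithms.ppo",
}

def resolve_get_actions(algorithm: str = "matd3"):
    # Unknown names fall back to matd3, mirroring the default in the hunk above.
    module_path = GET_ACTIONS_MODULES.get(algorithm, GET_ACTIONS_MODULES["matd3"])
    return importlib.import_module(module_path).get_actions
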
1 change: 1 addition & 0 deletions assume/reinforcement_learning/__init__.py
@@ -3,4 +3,5 @@
# SPDX-License-Identifier: AGPL-3.0-or-later

from assume.reinforcement_learning.buffer import ReplayBuffer
from assume.reinforcement_learning.buffer import RolloutBuffer
from assume.reinforcement_learning.learning_role import Learning
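
The newly exported RolloutBuffer is the on-policy counterpart to the existing ReplayBuffer: PPO fills it during a rollout, updates the policy on mini-batches of its contents (commit 5ad6e15), and then discards it. A minimal sketch of the data such a buffer holds, with illustrative field names mirroring the rl_observations, rl_actions, rl_rewards and rl_log_probs lists added in assume/common/base.py (not necessarily the fields of the PR's implementation):

import torch as th

class RolloutBufferSketch:
    """Illustrative on-policy buffer: filled during a rollout, cleared after each policy update."""

    def __init__(self):
        self.observations: list[th.Tensor] = []
        self.actions: list[th.Tensor] = []
        self.log_probs: list[th.Tensor] = []
        self.rewards: list[float] = []

    def add(self, obs, action, log_prob, reward):
        self.observations.append(obs)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)

    def reset(self):
        self.observations.clear()
        self.actions.clear()
        self.log_probs.clear()
        self.rewards.clear()
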
2 changes: 2 additions & 0 deletions assume/reinforcement_learning/algorithms/__init__.py
@@ -7,9 +7,11 @@
from assume.reinforcement_learning.neural_network_architecture import (
MLPActor,
LSTMActor,
DistActor,
)

actor_architecture_aliases: dict[str, type[nn.Module]] = {
"mlp": MLPActor,
"lstm": LSTMActor,
"dist": DistActor,
}
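
DistActor is the new actor architecture with a distribution layer introduced in commit a0c2a88 so that actions can be sampled for PPO. A minimal sketch of such an actor, assuming a Gaussian policy head; the hidden sizes, the clamping of the action distribution (commit 8928bf7) and the orthogonal initialization (commit 949ba73) are simplified or omitted here:

import torch as th
import torch.nn as nn
from torch.distributions import Normal

class DistActorSketch(nn.Module):
    """Illustrative actor whose forward pass returns an action distribution instead of a point action."""

    def __init__(self, obs_dim: int, act_dim: int, hidden_dim: int = 256):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
        )
        self.mu = nn.Linear(hidden_dim, act_dim)
        # state-independent log standard deviation, a common choice for PPO policies
        self.log_std = nn.Parameter(th.zeros(act_dim))

    def forward(self, obs: th.Tensor) -> Normal:
        mean = th.tanh(self.mu(self.body(obs)))  # keep the mean inside the [-1, 1] action bounds
        return Normal(mean, self.log_std.exp())
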
17 changes: 1 addition & 16 deletions assume/reinforcement_learning/algorithms/base_algorithm.py
@@ -34,32 +34,17 @@ def __init__(
# init learning_role as object of Learning class
learning_role,
learning_rate=1e-4,
episodes_collecting_initial_experience=100,
batch_size=1024,
tau=0.005,
gamma=0.99,
gradient_steps=-1,
policy_delay=2,
target_policy_noise=0.2,
target_noise_clip=0.5,
actor_architecture="mlp",
**kwargs, # allow additional params for specific algorithms
):
super().__init__()

self.learning_role = learning_role
self.learning_rate = learning_rate
self.episodes_collecting_initial_experience = (
episodes_collecting_initial_experience
)
self.batch_size = batch_size
self.gamma = gamma
self.tau = tau

self.gradient_steps = gradient_steps

self.policy_delay = policy_delay
self.target_noise_clip = target_noise_clip
self.target_policy_noise = target_policy_noise

if actor_architecture in actor_architecture_aliases.keys():
self.actor_architecture_class = actor_architecture_aliases[
133 changes: 86 additions & 47 deletions assume/reinforcement_learning/algorithms/matd3.py
@@ -11,7 +11,7 @@

from assume.common.base import LearningStrategy
from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm
from assume.reinforcement_learning.learning_utils import polyak_update
from assume.reinforcement_learning.learning_utils import polyak_update, collect_obs_for_central_critic
from assume.reinforcement_learning.neural_network_architecture import CriticTD3

logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ class TD3(RLAlgorithm):

Original paper: https://arxiv.org/pdf/1802.09477.pdf
"""

def __init__(
self,
learning_role,
@@ -46,16 +46,16 @@ def __init__(
super().__init__(
learning_role,
learning_rate,
episodes_collecting_initial_experience,
batch_size,
tau,
gamma,
gradient_steps,
policy_delay,
target_policy_noise,
target_noise_clip,
actor_architecture,
)
self.episodes_collecting_initial_experience = episodes_collecting_initial_experience
self.tau = tau
self.gradient_steps = gradient_steps
self.policy_delay = policy_delay
self.target_policy_noise = target_policy_noise
self.target_noise_clip = target_noise_clip
self.n_updates = 0

def save_params(self, directory):
@@ -201,6 +201,8 @@ def load_actor_params(self, directory: str) -> None:
except Exception:
logger.warning(f"No actor values loaded for agent {u_id}")



def initialize_policy(self, actors_and_critics: dict = None) -> None:
"""
Create actor and critic networks for reinforcement learning.
@@ -293,7 +295,7 @@ def create_critics(self) -> None:
This method initializes critic networks for each agent in the reinforcement learning setup.

Notes:
The observation dimension need to be the same, due to the centralized criic that all actors share.
The observation dimension need to be the same, due to the centralized critic that all actors share.
If you have units with different observation dimensions. They need to have different critics and hence learning roles.
"""
n_agents = len(self.learning_role.rl_strats)
@@ -458,47 +460,14 @@ def update_policy(self):

all_actions = actions.view(self.batch_size, -1)

# this takes the unique observations from all other agents assuming that
# the unique observations are at the end of the observation vector
temp = th.cat(
(
states[:, :i, self.obs_dim - self.unique_obs_dim :].reshape(
self.batch_size, -1
),
states[
:, i + 1 :, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
),
axis=1,
#collect observations for critic
all_states = collect_obs_for_central_critic(
states, i, self.obs_dim, self.unique_obs_dim, self.batch_size
)

# the final all_states vector now contains the current agent's observation
# and the unique observations from all other agents
all_states = th.cat(
(states[:, i, :].reshape(self.batch_size, -1), temp), axis=1
).view(self.batch_size, -1)
# all_states = states[:, i, :].reshape(self.batch_size, -1)

# this is the same as above but for the next states
temp = th.cat(
(
next_states[
:, :i, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
next_states[
:, i + 1 :, self.obs_dim - self.unique_obs_dim :
].reshape(self.batch_size, -1),
),
axis=1,
all_next_states = collect_obs_for_central_critic(
next_states, i, self.obs_dim, self.unique_obs_dim, self.batch_size
)

# the final all_next_states vector now contains the current agent's observation
# and the unique observations from all other agents
all_next_states = th.cat(
(next_states[:, i, :].reshape(self.batch_size, -1), temp), axis=1
).view(self.batch_size, -1)
# all_next_states = next_states[:, i, :].reshape(self.batch_size, -1)

with th.no_grad():
# Compute the next Q-values: min over all critics targets
next_q_values = th.cat(
@@ -548,3 +517,73 @@ def update_policy(self):
actor.parameters(), actor_target.parameters(), self.tau
)
i += 1
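
update_policy now delegates the observation stacking for the centralized critic to collect_obs_for_central_critic in learning_utils. Judging from the deleted inline code above, the helper plausibly does something like the following (the signature matches the call sites; the body is reconstructed, not copied from the PR):

import torch as th

def collect_obs_for_central_critic(states, i, obs_dim, unique_obs_dim, batch_size):
    # unique observations of every agent except agent i, assumed to sit at the end of each observation vector
    others = th.cat(
        (
            states[:, :i, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
            states[:, i + 1:, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
        ),
        axis=1,
    )
    # prepend agent i's full observation so the critic sees its own state plus everyone else's unique part
    return th.cat((states[:, i, :].reshape(batch_size, -1), others), axis=1).view(
        batch_size, -1
    )
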


def get_actions(rl_strategy, next_observation):
"""
Gets actions for a unit based on the observation using MATD3.

Args:
rl_strategy (RLStrategy): The strategy containing relevant information.
next_observation (torch.Tensor): The observation.

Returns:
torch.Tensor: The actions containing two bid prices.
tuple: The noise (if applicable).

Note:
If the agent is in learning mode, the actions are chosen by the actor neural net and noise is added to the action.
In the first episodes, the agent is in initial exploration mode, where the action is chosen by noise only to explore
the entire action space; the number of these episodes is defined by episodes_collecting_initial_experience.
If the agent is not in learning mode, the actions are chosen by the actor neural net without noise.
"""

actor = rl_strategy.actor
device = rl_strategy.device
float_type = rl_strategy.float_type
act_dim = rl_strategy.act_dim
learning_mode = rl_strategy.learning_mode
perform_evaluation = rl_strategy.perform_evaluation
action_noise = rl_strategy.action_noise
collect_initial_experience_mode = rl_strategy.collect_initial_experience_mode

# distinction whether we are in learning mode or not to handle exploration realised with noise
if learning_mode and not perform_evaluation:
# if we are in learning mode the first x episodes we want to explore the entire action space
# to get a good initial experience, in the area around the costs of the agent
if collect_initial_experience_mode:
# define current action as solely noise
noise = (
th.normal(mean=0.0, std=0.2, size=(1, act_dim), dtype=float_type)
.to(device)
.squeeze()
)

# =============================================================================
# 2.1 Get Actions and handle exploration
# =============================================================================
base_bid = next_observation[-1]

# add noise to the last dimension of the observation
# needs to be adjusted if observation space is changed, because only makes sense
# if the last dimension of the observation space are the marginal cost
curr_action = noise + base_bid.clone().detach()

else:
# if we are not in the initial exploration phase we choose the action with the actor neural net
# and add noise to the action
curr_action = actor(next_observation).detach() # calls the forward method of the actor network
noise = th.tensor(
action_noise.noise(), device=device, dtype=float_type
)
curr_action += noise
else:
# if we are not in learning mode we just use the actor neural net to get the action without adding noise
curr_action = actor(next_observation).detach()
noise = tuple(0 for _ in range(act_dim))

# Clamp actions to be within the valid action space bounds
curr_action = curr_action.clamp(-1, 1)

return curr_action, noise
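
assume/common/base.py above also imports a get_actions from the ppo module, and the unit outputs gain an rl_log_probs list, but that file is not part of this excerpt. Given the DistActor architecture, a PPO-style get_actions plausibly samples from the policy distribution and returns the log-probability instead of exploration noise; the following is a sketch under that assumption, not the PR's implementation:

import torch as th

def get_actions_ppo_sketch(rl_strategy, next_observation):
    """Illustrative PPO action selection: sample during training, act on the mean during evaluation."""
    # assumes the actor returns a torch.distributions object, as in the DistActor sketch above
    dist = rl_strategy.actor(next_observation)

    if rl_strategy.learning_mode and not rl_strategy.perform_evaluation:
        curr_action = dist.sample()  # stochastic actions provide the exploration in PPO
    else:
        curr_action = dist.mean  # deterministic behaviour for evaluation runs

    log_prob = dist.log_prob(curr_action).sum(dim=-1).detach()

    # clamp to the valid action space, as in the MATD3 version above
    return curr_action.clamp(-1, 1).detach(), log_prob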
