From e99673703ed9859ff1775b76632a8431de1052db Mon Sep 17 00:00:00 2001 From: ufqjh Date: Wed, 11 Sep 2024 14:25:09 +0200 Subject: [PATCH 01/23] PPO --- assume/reinforcement_learning/__init__.py | 1 + .../algorithms/matd3.py | 16 +- .../reinforcement_learning/algorithms/ppo.py | 668 ++++++++++++++++++ assume/reinforcement_learning/buffer.py | 97 +++ .../reinforcement_learning/learning_role.py | 13 +- .../reinforcement_learning/learning_utils.py | 75 +- examples/examples.py | 8 +- 7 files changed, 855 insertions(+), 23 deletions(-) create mode 100644 assume/reinforcement_learning/algorithms/ppo.py diff --git a/assume/reinforcement_learning/__init__.py b/assume/reinforcement_learning/__init__.py index a10131609..152fcbbdb 100644 --- a/assume/reinforcement_learning/__init__.py +++ b/assume/reinforcement_learning/__init__.py @@ -3,4 +3,5 @@ # SPDX-License-Identifier: AGPL-3.0-or-later from assume.reinforcement_learning.buffer import ReplayBuffer +from assume.reinforcement_learning.buffer import RolloutBuffer from assume.reinforcement_learning.learning_role import Learning diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index 66010da13..cc84799e5 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -27,7 +27,7 @@ class TD3(RLAlgorithm): Original paper: https://arxiv.org/pdf/1802.09477.pdf """ - + def __init__( self, learning_role, @@ -198,6 +198,19 @@ def load_actor_params(self, directory: str) -> None: except Exception: logger.warning(f"No actor values loaded for agent {u_id}") + + + + + + + + + + + + + def initialize_policy(self, actors_and_critics: dict = None) -> None: """ Create actor and critic networks for reinforcement learning. @@ -514,3 +527,4 @@ def update_policy(self): ) i += 1 + diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py new file mode 100644 index 000000000..2a3ab93c4 --- /dev/null +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -0,0 +1,668 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +import logging +import os + +import torch as th +from torch.nn import functional as F +from torch.optim import Adam + +from assume.common.base import LearningStrategy +from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm +# Check later +from assume.reinforcement_learning.learning_utils import Actor, CriticPPO, polyak_update + +logger = logging.getLogger(__name__) + + +class PPO(RLAlgorithm): + """ + Proximal Policy Optimization (PPO) is a robust and efficient policy gradient method for reinforcement learning. + It strikes a balance between trust-region methods and simpler approaches by using clipped objective functions. + PPO avoids large updates to the policy by restricting changes to stay within a specified range, which helps stabilize training. + The key improvements include the introduction of a surrogate objective that limits policy updates and ensures efficient learning, + as well as the use of multiple epochs of stochastic gradient descent on batches of data. 
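As a point of reference for the clipped objective described above, the following is a minimal, self-contained sketch of the PPO surrogate loss in PyTorch; the tensor names are illustrative and not taken from this patch, but the structure is the same as in update_policy further down in this file.

import torch as th

def ppo_clipped_loss(log_probs_new, log_probs_old, advantages, clip_ratio=0.2):
    # Probability ratio r = pi_new(a|s) / pi_old(a|s), computed in log space for stability.
    ratio = (log_probs_new - log_probs_old).exp()
    # Unclipped surrogate objective.
    surrogate1 = ratio * advantages
    # Clipped surrogate: the ratio is restricted to [1 - clip_ratio, 1 + clip_ratio].
    surrogate2 = th.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantages
    # The pessimistic minimum bounds how far a single update can move the policy.
    return -th.min(surrogate1, surrogate2).mean()

# Example call with dummy values:
loss = ppo_clipped_loss(
    log_probs_new=th.tensor([-0.9, -1.1]),
    log_probs_old=th.tensor([-1.0, -1.0]),
    advantages=th.tensor([0.5, -0.2]),
)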
+ + Open AI Spinning guide: https://spinningup.openai.com/en/latest/algorithms/ppo.html# + + Original paper: https://arxiv.org/pdf/1802.09477.pdf + """ + + # Change order and mandatory parameters in the superclass, removed and newly added parameters + def __init__( + self, + learning_role, + learning_rate=1e-4, + batch_size=1024, + gamma=0.99, # Discount factor used for future reward consideration. + + # Added PPO parameters + epochs=10, # Number of epochs to train each policy update. + clip_ratio=0.2, # Clipping ratio for PPO's policy loss to limit the change in policy update. Anstatt den vollen Wert der Änderung zu maximieren, wird das Ziel in einem bestimmten Bereich begrenzt. Der Bereich ist durch einen Parameter (den Clipping-Faktor) definiert, der angibt, wie weit die neue Policy von der alten Policy abweichen darf. Wenn das Verhältnis der Wahrscheinlichkeiten zu weit von 1 abweicht, wird die Verbesserung beschnitten. + vf_coef=0.5, # Coefficient for value function loss in PPO. + entropy_coef=0.01, # Coefficient for entropy bonus (to encourage exploration). + max_grad_norm=0.5, # Maximum gradient norm for gradient clipping. + + # Removed MATD3 parameters + # episodes_collecting_initial_experience=100, + # tau=0.005, + # gradient_steps=-1, + # policy_delay=2, + # target_policy_noise=0.2, + # target_noise_clip=0.5, + ): + super().__init__( + learning_role, + learning_rate, + batch_size, + gamma, + + # episodes_collecting_initial_experience, + # tau, + # gradient_steps, + # policy_delay, + # target_policy_noise, + # target_noise_clip, + ) + #self.n_updates = 0 + self.clip_ratio = clip_ratio + self.vf_coef = vf_coef + self.entropy_coef = entropy_coef + self.max_grad_norm = max_grad_norm + self.epochs = epochs + + # Unchanged method from MATD3 + def save_params(self, directory): + """ + This method saves the parameters of both the actor and critic networks associated with the learning role. It organizes the + saved parameters into separate directories for critics and actors within the specified base directory. + + Args: + directory (str): The base directory for saving the parameters. + """ + self.save_critic_params(directory=f"{directory}/critics") + self.save_actor_params(directory=f"{directory}/actors") + + # Removed critic_target in comparison to MATD3 + def save_critic_params(self, directory): + """ + Save the parameters of critic networks. + + This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict. It organizes the saved parameters into a directory structure specific to the critic + associated with each learning strategy. + + Args: + directory (str): The base directory for saving the parameters. + """ + os.makedirs(directory, exist_ok=True) + for u_id in self.learning_role.rl_strats.keys(): + obj = { + "critic": self.learning_role.critics[u_id].state_dict(), + # "critic_target": self.learning_role.target_critics[u_id].state_dict(), + "critic_optimizer": self.learning_role.critics[ + u_id + ].optimizer.state_dict(), + } + path = f"{directory}/critic_{u_id}.pt" + th.save(obj, path) + + # Removed actor_target in comparison to MATD3 + def save_actor_params(self, directory): + """ + Save the parameters of actor networks. + + This method saves the parameters of the actor networks, including the actor's state_dict, actor_target's state_dict, and + the actor's optimizer state_dict. It organizes the saved parameters into a directory structure specific to the actor + associated with each learning strategy. 
+ + Args: + directory (str): The base directory for saving the parameters. + """ + os.makedirs(directory, exist_ok=True) + for u_id in self.learning_role.rl_strats.keys(): + obj = { + "actor": self.learning_role.rl_strats[u_id].actor.state_dict(), + # "actor_target": self.learning_role.rl_strats[ + # u_id + # ].actor_target.state_dict(), + "actor_optimizer": self.learning_role.rl_strats[ + u_id + ].actor.optimizer.state_dict(), + } + path = f"{directory}/actor_{u_id}.pt" + th.save(obj, path) + + # Unchanged method from MATD3 + def load_params(self, directory: str) -> None: + """ + Load the parameters of both actor and critic networks. + + This method loads the parameters of both the actor and critic networks associated with the learning role from the specified + directory. It uses the `load_critic_params` and `load_actor_params` methods to load the respective parameters. + + Args: + directory (str): The directory from which the parameters should be loaded. + """ + self.load_critic_params(directory) + self.load_actor_params(directory) + + # Removed critic_target in comparison to MATD3 + def load_critic_params(self, directory: str) -> None: + """ + Load the parameters of critic networks from a specified directory. + + This method loads the parameters of critic networks, including the critic's state_dict, critic_target's state_dict, and + the critic's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated + with the learning role, loads the respective parameters, and updates the critic and target critic networks accordingly. + + Args: + directory (str): The directory from which the parameters should be loaded. + """ + logger.info("Loading critic parameters...") + + if not os.path.exists(directory): + logger.warning( + "Specified directory for loading the critics does not exist! Starting with randomly initialized values!" + ) + return + + for u_id in self.learning_role.rl_strats.keys(): + try: + critic_params = self.load_obj( + directory=f"{directory}/critics/critic_{str(u_id)}.pt" + ) + self.learning_role.critics[u_id].load_state_dict( + critic_params["critic"] + ) + # self.learning_role.target_critics[u_id].load_state_dict( + # critic_params["critic_target"] + # ) + self.learning_role.critics[u_id].optimizer.load_state_dict( + critic_params["critic_optimizer"] + ) + except Exception: + logger.warning(f"No critic values loaded for agent {u_id}") + + # Removed actor_target in comparison to MATD3 + def load_actor_params(self, directory: str) -> None: + """ + Load the parameters of actor networks from a specified directory. + + This method loads the parameters of actor networks, including the actor's state_dict, actor_target's state_dict, and + the actor's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated + with the learning role, loads the respective parameters, and updates the actor and target actor networks accordingly. + + Args: + directory (str): The directory from which the parameters should be loaded. + """ + logger.info("Loading actor parameters...") + if not os.path.exists(directory): + logger.warning( + "Specified directory for loading the actors does not exist! Starting with randomly initialized values!" 
+ ) + return + + for u_id in self.learning_role.rl_strats.keys(): + try: + actor_params = self.load_obj( + directory=f"{directory}/actors/actor_{str(u_id)}.pt" + ) + self.learning_role.rl_strats[u_id].actor.load_state_dict( + actor_params["actor"] + ) + # self.learning_role.rl_strats[u_id].actor_target.load_state_dict( + # actor_params["actor_target"] + # ) + self.learning_role.rl_strats[u_id].actor.optimizer.load_state_dict( + actor_params["actor_optimizer"] + ) + except Exception: + logger.warning(f"No actor values loaded for agent {u_id}") + + + # Removed target_critics and actor_target in comparison to MATD3 + def initialize_policy(self, actors_and_critics: dict = None) -> None: + """ + Create actor and critic networks for reinforcement learning. + + If `actors_and_critics` is None, this method creates new actor and critic networks. + If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. + + Args: + actors_and_critics (dict): The actor and critic networks to be assigned. + + """ + if actors_and_critics is None: + self.create_actors() + self.create_critics() + + else: + self.learning_role.critics = actors_and_critics["critics"] + # self.learning_role.target_critics = actors_and_critics["target_critics"] + for u_id, unit_strategy in self.learning_role.rl_strats.items(): + unit_strategy.actor = actors_and_critics["actors"][u_id] + # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] + + self.obs_dim = actors_and_critics["obs_dim"] + self.act_dim = actors_and_critics["act_dim"] + self.unique_obs_dim = actors_and_critics["unique_obs_dim"] + + # Removed actor_target in comparison to MATD3 + def create_actors(self) -> None: + """ + Create actor networks for reinforcement learning for each unit strategy. + + This method initializes actor networks and their corresponding target networks for each unit strategy. + The actors are designed to map observations to action probabilities in a reinforcement learning setting. + + The created actor networks are associated with each unit strategy and stored as attributes. + + Notes: + The observation dimension need to be the same, due to the centralized criic that all actors share. + If you have units with different observation dimensions. They need to have different critics and hence learning roles. 
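The update step further below calls evaluate_actions on the actor to obtain new log-probabilities and an entropy term, which requires a stochastic policy. The actor network itself is not shown in this patch, so the following is only a sketch of one possible Gaussian policy, assuming the ActorPPO name introduced in the second commit; the layer sizes and the state-independent log-std are assumptions.

import torch as th
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F

class ActorPPO(nn.Module):
    # Illustrative Gaussian policy head, not the implementation shipped with this patch.
    def __init__(self, obs_dim: int, act_dim: int, float_type=th.float):
        super().__init__()
        self.FC1 = nn.Linear(obs_dim, 256, dtype=float_type)
        self.FC2 = nn.Linear(256, 128, dtype=float_type)
        self.mu = nn.Linear(128, act_dim, dtype=float_type)
        # State-independent log standard deviation, a common PPO choice.
        self.log_std = nn.Parameter(th.zeros(act_dim, dtype=float_type))

    def forward(self, obs):
        x = F.relu(self.FC1(obs))
        x = F.relu(self.FC2(x))
        return Normal(self.mu(x), self.log_std.exp())

    def evaluate_actions(self, obs, actions):
        dist = self.forward(obs)
        # Log-probabilities summed over action dimensions plus the entropy term used for the bonus.
        return dist.log_prob(actions).sum(dim=-1), dist.entropy().sum(dim=-1)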
+ + """ + + obs_dim_list = [] + act_dim_list = [] + + for _, unit_strategy in self.learning_role.rl_strats.items(): + unit_strategy.actor = Actor( + obs_dim=unit_strategy.obs_dim, + act_dim=unit_strategy.act_dim, + float_type=self.float_type, + ).to(self.device) + + # unit_strategy.actor_target = Actor( + # obs_dim=unit_strategy.obs_dim, + # act_dim=unit_strategy.act_dim, + # float_type=self.float_type, + # ).to(self.device) + # unit_strategy.actor_target.load_state_dict(unit_strategy.actor.state_dict()) + # unit_strategy.actor_target.train(mode=False) + + unit_strategy.actor.optimizer = Adam( + unit_strategy.actor.parameters(), lr=self.learning_rate + ) + + obs_dim_list.append(unit_strategy.obs_dim) + act_dim_list.append(unit_strategy.act_dim) + + if len(set(obs_dim_list)) > 1: + raise ValueError( + "All observation dimensions must be the same for all RL agents" + ) + else: + self.obs_dim = obs_dim_list[0] + + if len(set(act_dim_list)) > 1: + raise ValueError("All action dimensions must be the same for all RL agents") + else: + self.act_dim = act_dim_list[0] + + # Removed target_critics in comparison to MATD3 + # Changed initialization of CriticPPO compared to MATD3 + def create_critics(self) -> None: + """ + Create critic networks for reinforcement learning. + + This method initializes critic networks for each agent in the reinforcement learning setup. + + Notes: + The observation dimension need to be the same, due to the centralized critic that all actors share. + If you have units with different observation dimensions. They need to have different critics and hence learning roles. + """ + n_agents = len(self.learning_role.rl_strats) + strategy: LearningStrategy + unique_obs_dim_list = [] + + for u_id, strategy in self.learning_role.rl_strats.items(): + + self, + n_agents: int, + obs_dim: int, + float_type, + unique_obs_dim: int, + + + self.learning_role.critics[u_id] = CriticPPO( + n_agents=n_agents, + obs_dim=strategy.obs_dim, + unique_obs_dim=strategy.unique_obs_dim, + float_type=self.float_type, + ) + + self.learning_role.critics[u_id].optimizer = Adam( + self.learning_role.critics[u_id].parameters(), lr=self.learning_rate + ) + + self.learning_role.target_critics[u_id].load_state_dict( + self.learning_role.critics[u_id].state_dict() + ) + self.learning_role.target_critics[u_id].train(mode=False) + + self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( + self.device + ) + self.learning_role.target_critics[u_id] = self.learning_role.target_critics[ + u_id + ].to(self.device) + + unique_obs_dim_list.append(strategy.unique_obs_dim) + + # check if all unique_obs_dim are the same and raise an error if not + # if they are all the same, set the unique_obs_dim attribute + if len(set(unique_obs_dim_list)) > 1: + raise ValueError( + "All unique_obs_dim values must be the same for all RL agents" + ) + else: + self.unique_obs_dim = unique_obs_dim_list[0] + + def extract_policy(self) -> dict: + """ + Extract actor and critic networks. + + This method extracts the actor and critic networks associated with each learning strategy and organizes them into a + dictionary structure. The extracted networks include actors, actor_targets, critics, and target_critics. The resulting + dictionary is typically used for saving and sharing these networks. + + Returns: + dict: The extracted actor and critic networks. 
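The critics created above are centralized: each agent's critic sees its own full observation plus the unique observation components of every other agent, which is where combined_obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) in CriticPPO comes from. A minimal sketch of assembling such an input follows; the shapes and variable names are assumptions for illustration only.

import torch as th

batch_size, n_agents, obs_dim, unique_obs_dim = 4, 3, 10, 2
# One full observation per agent; the unique components sit at the end of each vector.
states = th.randn(batch_size, n_agents, obs_dim)

i = 0  # index of the agent whose critic is evaluated
others_unique = th.cat(
    (
        states[:, :i, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
        states[:, i + 1:, obs_dim - unique_obs_dim:].reshape(batch_size, -1),
    ),
    dim=1,
)
# Own observation first, then the unique observations of all other agents:
# shape (batch_size, obs_dim + unique_obs_dim * (n_agents - 1)).
critic_input = th.cat((states[:, i, :], others_unique), dim=1)
# value = learning_role.critics[u_id](critic_input)  # would return shape (batch_size, 1)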
+ """ + actors = {} + actor_targets = {} + + for u_id, unit_strategy in self.learning_role.rl_strats.items(): + actors[u_id] = unit_strategy.actor + actor_targets[u_id] = unit_strategy.actor_target + + actors_and_critics = { + "actors": actors, + # "actor_targets": actor_targets, + "critics": self.learning_role.critics, + "target_critics": self.learning_role.target_critics, + "obs_dim": self.obs_dim, + "act_dim": self.act_dim, + "unique_obs_dim": self.unique_obs_dim, + } + + return actors_and_critics + + def update_policy(self): + """ + Perform policy updates using PPO with the clipped objective. + """ + logger.debug("Updating Policy") + + for epoch in range(self.epochs): + # Sample a batch from the replay buffer + transitions = self.learning_role.buffer.sample(self.batch_size) + states, actions, log_probs_old, returns, advantages = ( + transitions.observations, + transitions.actions, + transitions.log_probs, + transitions.returns, + transitions.advantages, + ) + + # Update the policy (actor) + log_probs_new, entropy = self.learning_role.actor.evaluate_actions(states, actions) + + # Calculate the ratio of new policy probability to old policy probability + # This represents how much the new policy has changed compared to the old policy + ratio = (log_probs_new - log_probs_old).exp() + + # Compute the surrogate loss without clipping + # This is the raw loss term based on the advantage function + surrogate1 = ratio * advantages + + # Apply the clipping function to the ratio to prevent large policy updates + # The clipping function limits the ratio to be within the range [1 - clip_ratio, 1 + clip_ratio] + # This prevents the policy from deviating too much from the old policy + surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages + + # Calculate the final policy loss by taking the minimum of the unclipped and clipped surrogate losses + # The idea is to prevent large changes in policy and ensure stability during training + # The final policy loss is the negative mean of this minimum value + policy_loss = -th.min(surrogate1, surrogate2).mean() + + surrogate1 = ratio * advantages + surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages + policy_loss = -th.min(surrogate1, surrogate2).mean() + + # Update the critic (value function) + values = self.learning_role.critic(states).squeeze() + value_loss = F.mse_loss(returns, values) + + # Total loss + loss = policy_loss + self.vf_coef * value_loss - self.entropy_coef * entropy.mean() + + # Optimize actor and critic + self.learning_role.actor.optimizer.zero_grad() + self.learning_role.critic.optimizer.zero_grad() + loss.backward() + + # Clip gradients + th.nn.utils.clip_grad_norm_(self.learning_role.actor.parameters(), self.max_grad_norm) + th.nn.utils.clip_grad_norm_(self.learning_role.critic.parameters(), self.max_grad_norm) + + self.learning_role.actor.optimizer.step() + self.learning_role.critic.optimizer.step() + + + # def update_policy(self): + # """ + # Update the policy of the reinforcement learning agent using the Twin Delayed Deep Deterministic Policy Gradients (TD3) algorithm. + + # Notes: + # This function performs the policy update step, which involves updating the actor (policy) and critic (Q-function) networks + # using TD3 algorithm. It iterates over the specified number of gradient steps and performs the following steps for each + # learning strategy: + + # 1. Sample a batch of transitions from the replay buffer. + # 2. 
Calculate the next actions with added noise using the actor target network. + # 3. Compute the target Q-values based on the next states, rewards, and the target critic network. + # 4. Compute the critic loss as the mean squared error between current Q-values and target Q-values. + # 5. Optimize the critic network by performing a gradient descent step. + # 6. Optionally, update the actor network if the specified policy delay is reached. + # 7. Apply Polyak averaging to update target networks. + + # This function implements the TD3 algorithm's key step for policy improvement and exploration. + # """ + + # logger.debug("Updating Policy") + # n_rl_agents = len(self.learning_role.rl_strats.keys()) + # for _ in range(self.gradient_steps): + # self.n_updates += 1 + # i = 0 + + # for u_id in self.learning_role.rl_strats.keys(): + # critic_target = self.learning_role.target_critics[u_id] + # critic = self.learning_role.critics[u_id] + # actor = self.learning_role.rl_strats[u_id].actor + # actor_target = self.learning_role.rl_strats[u_id].actor_target + + # if i % 100 == 0: + # transitions = self.learning_role.buffer.sample(self.batch_size) + # states = transitions.observations + # actions = transitions.actions + # next_states = transitions.next_observations + # rewards = transitions.rewards + + # with th.no_grad(): + # # Select action according to policy and add clipped noise + # noise = actions.clone().data.normal_( + # 0, self.target_policy_noise + # ) + # noise = noise.clamp( + # -self.target_noise_clip, self.target_noise_clip + # ) + # next_actions = [ + # (actor_target(next_states[:, i, :]) + noise[:, i, :]).clamp( + # -1, 1 + # ) + # for i in range(n_rl_agents) + # ] + # next_actions = th.stack(next_actions) + + # next_actions = next_actions.transpose(0, 1).contiguous() + # next_actions = next_actions.view(-1, n_rl_agents * self.act_dim) + + # all_actions = actions.view(self.batch_size, -1) + + # # this takes the unique observations from all other agents assuming that + # # the unique observations are at the end of the observation vector + # temp = th.cat( + # ( + # states[:, :i, self.obs_dim - self.unique_obs_dim :].reshape( + # self.batch_size, -1 + # ), + # states[ + # :, i + 1 :, self.obs_dim - self.unique_obs_dim : + # ].reshape(self.batch_size, -1), + # ), + # axis=1, + # ) + + # # the final all_states vector now contains the current agent's observation + # # and the unique observations from all other agents + # all_states = th.cat( + # (states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 + # ).view(self.batch_size, -1) + # # all_states = states[:, i, :].reshape(self.batch_size, -1) + + # # this is the same as above but for the next states + # temp = th.cat( + # ( + # next_states[ + # :, :i, self.obs_dim - self.unique_obs_dim : + # ].reshape(self.batch_size, -1), + # next_states[ + # :, i + 1 :, self.obs_dim - self.unique_obs_dim : + # ].reshape(self.batch_size, -1), + # ), + # axis=1, + # ) + + # # the final all_next_states vector now contains the current agent's observation + # # and the unique observations from all other agents + # all_next_states = th.cat( + # (next_states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 + # ).view(self.batch_size, -1) + # # all_next_states = next_states[:, i, :].reshape(self.batch_size, -1) + + # with th.no_grad(): + # # Compute the next Q-values: min over all critics targets + # next_q_values = th.cat( + # critic_target(all_next_states, next_actions), dim=1 + # ) + # next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True) + # 
target_Q_values = ( + # rewards[:, i].unsqueeze(1) + self.gamma * next_q_values + # ) + + # # Get current Q-values estimates for each critic network + # current_Q_values = critic(all_states, all_actions) + + # # Compute critic loss + # critic_loss = sum( + # F.mse_loss(current_q, target_Q_values) + # for current_q in current_Q_values + # ) + + # # Optimize the critics + # critic.optimizer.zero_grad() + # critic_loss.backward() + # critic.optimizer.step() + + # # Delayed policy updates + # if self.n_updates % self.policy_delay == 0: + # # Compute actor loss + # state_i = states[:, i, :] + # action_i = actor(state_i) + + # all_actions_clone = actions.clone() + # all_actions_clone[:, i, :] = action_i + # all_actions_clone = all_actions_clone.view(self.batch_size, -1) + + # actor_loss = -critic.q1_forward( + # all_states, all_actions_clone + # ).mean() + + # actor.optimizer.zero_grad() + # actor_loss.backward() + # actor.optimizer.step() + + # polyak_update( + # critic.parameters(), critic_target.parameters(), self.tau + # ) + # polyak_update( + # actor.parameters(), actor_target.parameters(), self.tau + # ) + + # i += 1 + + + +# def save_params(self, directory): +# """ Save the parameters of the actor and critic networks """ +# self.save_actor_params(directory=f"{directory}/actors") +# self.save_critic_params(directory=f"{directory}/critics") + +# def save_actor_params(self, directory): +# """ Save actor parameters. """ +# os.makedirs(directory, exist_ok=True) +# for u_id in self.learning_role.rl_strats.keys(): +# obj = { +# "actor": self.learning_role.rl_strats[u_id].actor.state_dict(), +# "actor_optimizer": self.learning_role.rl_strats[u_id].actor.optimizer.state_dict(), +# } +# path = f"{directory}/actor_{u_id}.pt" +# th.save(obj, path) + +# def save_critic_params(self, directory): +# """ Save critic parameters. """ +# os.makedirs(directory, exist_ok=True) +# for u_id in self.learning_role.rl_strats.keys(): +# obj = { +# "critic": self.learning_role.critics[u_id].state_dict(), +# "critic_optimizer": self.learning_role.critics[u_id].optimizer.state_dict(), +# } +# path = f"{directory}/critic_{u_id}.pt" +# th.save(obj, path) + +# def load_params(self, directory: str) -> None: +# """ Load actor and critic parameters """ +# self.load_actor_params(directory) +# self.load_critic_params(directory) + +# def load_actor_params(self, directory: str) -> None: +# """ Load actor parameters from a directory """ +# if not os.path.exists(directory): +# logger.warning("Actor directory does not exist! Initializing randomly.") +# return + +# for u_id in self.learning_role.rl_strats.keys(): +# try: +# actor_params = self.load_obj(f"{directory}/actors/actor_{str(u_id)}.pt") +# self.learning_role.rl_strats[u_id].actor.load_state_dict(actor_params["actor"]) +# self.learning_role.rl_strats[u_id].actor.optimizer.load_state_dict(actor_params["actor_optimizer"]) +# except Exception: +# logger.warning(f"No actor values loaded for agent {u_id}") + +# def load_critic_params(self, directory: str) -> None: +# """ Load critic parameters from a directory """ +# if not os.path.exists(directory): +# logger.warning("Critic directory does not exist! 
Initializing randomly.") +# return + +# for u_id in self.learning_role.rl_strats.keys(): +# try: +# critic_params = self.load_obj(f"{directory}/critics/critic_{str(u_id)}.pt") +# self.learning_role.critics[u_id].load_state_dict(critic_params["critic"]) +# self.learning_role.critics[u_id].optimizer.load_state_dict(critic_params["critic_optimizer"]) +# except Exception: +# logger.warning(f"No critic values loaded for agent {u_id}") diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index e9406aca8..b1ec386a9 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -172,3 +172,100 @@ def sample(self, batch_size: int) -> ReplayBufferSamples: ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + + +class RolloutBuffer: + def __init__(self, buffer_size, obs_dim, act_dim, n_agents, gamma=0.99, gae_lambda=0.95, device="cpu"): + """ + A class for storing rollout data for PPO in a multi-agent setting. + Stores the trajectories (observations, actions, rewards, log_probs) for all agents. + + Args: + buffer_size (int): Max size of the buffer (in terms of time steps). + obs_dim (int): Dimension of the observation space. + act_dim (int): Dimension of the action space. + n_agents (int): Number of agents. + gamma (float): Discount factor for rewards. + gae_lambda (float): Lambda parameter for Generalized Advantage Estimation (GAE). + device (str): Device to store the data ('cpu' or 'cuda'). + """ + self.buffer_size = buffer_size + self.obs_dim = obs_dim + self.act_dim = act_dim + self.n_agents = n_agents + self.device = device + self.gamma = gamma + self.gae_lambda = gae_lambda + + # Initialize buffers + self.observations = np.zeros((buffer_size, n_agents, obs_dim), dtype=np.float32) + self.actions = np.zeros((buffer_size, n_agents, act_dim), dtype=np.float32) + self.rewards = np.zeros((buffer_size, n_agents), dtype=np.float32) + self.log_probs = np.zeros((buffer_size, n_agents), dtype=np.float32) + self.values = np.zeros((buffer_size, n_agents), dtype=np.float32) + self.advantages = np.zeros((buffer_size, n_agents), dtype=np.float32) + self.returns = np.zeros((buffer_size, n_agents), dtype=np.float32) + self.masks = np.ones((buffer_size, n_agents), dtype=np.float32) # Used to indicate episode boundaries + + self.pos = 0 + + def add(self, obs, actions, rewards, log_probs, values, dones): + """ + Add data for the current time step to the buffer. + + Args: + obs (np.array): The observations for all agents. + actions (np.array): The actions taken by all agents. + rewards (np.array): The rewards received by all agents. + log_probs (np.array): The log probabilities of the actions taken. + values (np.array): The value estimates for all agents. + dones (np.array): Whether the episode has finished for each agent. + """ + self.observations[self.pos] = obs + self.actions[self.pos] = actions + self.rewards[self.pos] = rewards + self.log_probs[self.pos] = log_probs + self.values[self.pos] = values + self.masks[self.pos] = 1.0 - dones + + self.pos += 1 + + def compute_returns_and_advantages(self, last_values, dones): + """ + Compute the returns and advantages using Generalized Advantage Estimation (GAE). + + Args: + last_values (np.array): Value estimates for the last observation. + dones (np.array): Whether the episode has finished for each agent. 
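Notes:
    The implementation below follows the standard GAE recursion:
    delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    A_t = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
    and the return used as the critic target is R_t = A_t + V(s_t).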
+ """ + last_advantage = 0 + for step in reversed(range(self.pos)): + if step == self.pos - 1: + next_non_terminal = 1.0 - dones + next_values = last_values + else: + next_non_terminal = self.masks[step + 1] + next_values = self.values[step + 1] + + delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step] + self.advantages[step] = last_advantage = delta + self.gamma * self.gae_lambda * next_non_terminal * last_advantage + self.returns[step] = self.advantages[step] + self.values[step] + + def get(self): + """ + Get all data stored in the buffer and convert it to PyTorch tensors. + Returns the observations, actions, log_probs, advantages, returns, and masks. + """ + data = ( + self.observations[:self.pos], + self.actions[:self.pos], + self.log_probs[:self.pos], + self.advantages[:self.pos], + self.returns[:self.pos], + self.masks[:self.pos], + ) + return tuple(map(lambda x: th.tensor(x, device=self.device), data)) + + def reset(self): + """Reset the buffer after each update.""" + self.pos = 0 diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 60eb47db3..f659c308e 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -28,6 +28,7 @@ class Learning(Role): """ + # TD3 and PPO (Replay buffer, gradient steps, early stopping, self.eval_episodes_done potentiall irrelevant for PPO) def __init__( self, learning_config: LearningConfig, @@ -39,8 +40,8 @@ def __init__( "early_stopping_threshold", 0.05 ) self.episodes_done = 0 - self.rl_strats: dict[int, LearningStrategy] = {} - self.rl_algorithm = learning_config["algorithm"] + self.rl_strats: dict[int, LearningStrategy] = {} # A dictionary for learning strategies, indexed according to integers. + self.rl_algorithm = learning_config["algorithm"] # The name of the reinforcement learning algorithm. self.critics = {} self.target_critics = {} @@ -97,6 +98,7 @@ def __init__( # list of avg_changes self.avg_rewards = [] + # TD3 and PPO def load_inter_episodic_data(self, inter_episodic_data): """ Load the inter-episodic data from the dict stored across simulation runs. @@ -119,6 +121,7 @@ def load_inter_episodic_data(self, inter_episodic_data): self.initialize_policy(inter_episodic_data["actors_and_critics"]) + # TD3 and PPO def get_inter_episodic_data(self): """ Dump the inter-episodic data to a dict for storing across simulation runs. @@ -137,6 +140,7 @@ def get_inter_episodic_data(self): "actors_and_critics": self.rl_algorithm.extract_policy(), } + # TD3 and PPO def setup(self) -> None: """ Set up the learning role for reinforcement learning training. @@ -173,6 +177,7 @@ def save_buffer_and_update(self, content: dict, meta: dict) -> None: self.update_policy() + # TD3 def turn_off_initial_exploration(self) -> None: """ Disable initial exploration mode for all learning strategies. @@ -185,6 +190,7 @@ def turn_off_initial_exploration(self) -> None: for _, unit in self.rl_strats.items(): unit.collect_initial_experience_mode = False + # TD3 and PPO def create_learning_algorithm(self, algorithm: RLAlgorithm): """ Create and initialize the reinforcement learning algorithm. @@ -207,6 +213,7 @@ def create_learning_algorithm(self, algorithm: RLAlgorithm): else: logger.error(f"Learning algorithm {algorithm} not implemented!") + # TD3 def initialize_policy(self, actors_and_critics: dict = None) -> None: """ Initialize the policy of the reinforcement learning agent considering the respective algorithm. 
@@ -228,6 +235,7 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: f"Folder with pretrained policies {directory} does not exist" ) + # TD3 and PPO def update_policy(self) -> None: """ Update the policy of the reinforcement learning agent. @@ -242,6 +250,7 @@ def update_policy(self) -> None: if self.episodes_done > self.episodes_collecting_initial_experience: self.rl_algorithm.update_policy() + # TD3 and PPO def compare_and_save_policies(self, metrics: dict) -> None: """ Compare evaluation metrics and save policies based on the best achieved performance according to the metrics calculated. diff --git a/assume/reinforcement_learning/learning_utils.py b/assume/reinforcement_learning/learning_utils.py index d2bce6142..652c7b915 100644 --- a/assume/reinforcement_learning/learning_utils.py +++ b/assume/reinforcement_learning/learning_utils.py @@ -11,15 +11,40 @@ from torch.nn import functional as F +# TD3 and PPO class ObsActRew(TypedDict): observation: list[th.Tensor] action: list[th.Tensor] reward: list[th.Tensor] +# TD3 and PPO observation_dict = dict[list[datetime], ObsActRew] +# TD3 and PPO +class Actor(nn.Module): + """ + The neurnal network for the actor. + """ + + def __init__(self, obs_dim: int, act_dim: int, float_type): + super(Actor, self).__init__() + + self.FC1 = nn.Linear(obs_dim, 256, dtype=float_type) + self.FC2 = nn.Linear(256, 128, dtype=float_type) + self.FC3 = nn.Linear(128, act_dim, dtype=float_type) + + def forward(self, obs): + x = F.relu(self.FC1(obs)) + x = F.relu(self.FC2(x)) + x = F.softsign(self.FC3(x)) + # x = th.tanh(self.FC3(x)) + + return x + + +# TD3 class CriticTD3(nn.Module): """Initialize parameters and build model. @@ -37,7 +62,7 @@ def __init__( float_type, unique_obs_dim: int = 0, ): - super().__init__() + super(CriticTD3, self).__init__() self.obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) self.act_dim = act_dim * n_agents @@ -83,7 +108,7 @@ def forward(self, obs, actions): x2 = self.FC2_4(x2) return x1, x2 - + def q1_forward(self, obs, actions): """ Only predict the Q-value using the first network. @@ -102,29 +127,45 @@ def q1_forward(self, obs, actions): x = self.FC1_4(x) return x + +class CriticPPO(nn.Module): + """Critic Network for Proximal Policy Optimization (PPO) in a Multi-Agent Setting. -class Actor(nn.Module): - """ - The neurnal network for the actor. 
+ Args: + n_agents (int): Number of agents + obs_dim (int): Dimension of each state + unique_obs_dim (int): Unique observation dimension per agent + float_type: Data type for the model parameters """ + # Actor dimension missing compared to MATD3 -> not needed for PPO + def __init__( + self, + n_agents: int, + obs_dim: int, + float_type, + unique_obs_dim: int, + ): - def __init__(self, obs_dim: int, act_dim: int, float_type): - super().__init__() + super(CriticPPO, self).__init__() - self.FC1 = nn.Linear(obs_dim, 256, dtype=float_type) - self.FC2 = nn.Linear(256, 128, dtype=float_type) - self.FC3 = nn.Linear(128, act_dim, dtype=float_type) + # Define the combined observation dimension + combined_obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) - def forward(self, obs): - x = F.relu(self.FC1(obs)) - x = F.relu(self.FC2(x)) - x = F.softsign(self.FC3(x)) - # x = th.tanh(self.FC3(x)) + # Define the architecture of the Critic network + self.fc1 = nn.Linear(combined_obs_dim, 256, dtype=float_type) + self.fc2 = nn.Linear(256, 128, dtype=float_type) + self.fc3 = nn.Linear(128, 1, dtype=float_type) - return x + def forward(self, x): + """Forward pass through the network.""" + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + value = self.fc3(x) + return value +# TD3 # Ornstein-Uhlenbeck Noise # from https://github.com/songrotek/DDPG/blob/master/ou_noise.py class OUNoise: @@ -158,6 +199,7 @@ def noise(self): return noise +# TD3 class NormalActionNoise: """ A gaussian action noise @@ -176,6 +218,7 @@ def noise(self): return noise +# TD3 def polyak_update(params, target_params, tau: float): """ Perform a Polyak average update on ``target_params`` using ``params``: diff --git a/examples/examples.py b/examples/examples.py index 491c36a12..fcf00f250 100644 --- a/examples/examples.py +++ b/examples/examples.py @@ -11,7 +11,7 @@ log = logging.getLogger(__name__) -csv_path = "" +csv_path = "C:/Users/manuk/OneDrive - bwedu/01_Studium/Master/6. 
Semester/Spezialveranstaltung/CSVs" os.makedirs("./examples/local_db", exist_ok=True) @@ -60,7 +60,7 @@ "scenario": "example_02", "study_case": "dam_case_2019", }, - "small_learning_1": {"scenario": "example_02a", "study_case": "base"}, + "small_learning_1": {"scenario": "example_02a", "study_case": "tiny"}, "small_learning_2": {"scenario": "example_02b", "study_case": "base"}, "small_learning_3": {"scenario": "example_02c", "study_case": "base"}, "learning_with_complex_bids": { @@ -76,8 +76,8 @@ - local_db: without database and grafana - timescale: with database and grafana (note: you need docker installed) """ - data_format = "local_db" # "local_db" or "timescale" - example = "small" + data_format = "timescale" # "local_db" or "timescale" + example = "small_learning_1" if data_format == "local_db": db_uri = f"sqlite:///./examples/local_db/assume_db_{example}.db" From 46f843aacc24ed8dc6ee98c68488ad11fad4a481 Mon Sep 17 00:00:00 2001 From: ufqjh Date: Fri, 20 Sep 2024 11:49:29 +0200 Subject: [PATCH 02/23] DRL PPO Update --- assume/common/base.py | 3 + .../algorithms/base_algorithm.py | 17 +- .../algorithms/matd3.py | 26 +- .../reinforcement_learning/algorithms/ppo.py | 57 ++-- assume/reinforcement_learning/buffer.py | 28 +- .../reinforcement_learning/learning_role.py | 225 ++++++++++---- .../reinforcement_learning/learning_utils.py | 142 --------- .../neural_network_architecture.py | 76 +++++ assume/reinforcement_learning/raw_ppo.py | 189 ++++++++++++ assume/scenario/loader_csv.py | 291 ++++++++++++++---- assume/strategies/learning_advanced_orders.py | 2 - examples/inputs/example_02a/config.yaml | 84 +++-- .../inputs/example_02a/config_backup.yaml | 137 +++++++++ 13 files changed, 922 insertions(+), 355 deletions(-) create mode 100644 assume/reinforcement_learning/raw_ppo.py create mode 100644 examples/inputs/example_02a/config_backup.yaml diff --git a/assume/common/base.py b/assume/common/base.py index f4532d3d8..5b553257d 100644 --- a/assume/common/base.py +++ b/assume/common/base.py @@ -781,6 +781,9 @@ def __init__( # them into suitable format for recurrent neural networks self.num_timeseries_obs_dim = num_timeseries_obs_dim + print("TEST") + print(kwargs.get("algorithm", "lel")) + class LearningConfig(TypedDict): """ diff --git a/assume/reinforcement_learning/algorithms/base_algorithm.py b/assume/reinforcement_learning/algorithms/base_algorithm.py index d2e3bb78f..14614c0fc 100644 --- a/assume/reinforcement_learning/algorithms/base_algorithm.py +++ b/assume/reinforcement_learning/algorithms/base_algorithm.py @@ -34,32 +34,17 @@ def __init__( # init learning_role as object of Learning class learning_role, learning_rate=1e-4, - episodes_collecting_initial_experience=100, batch_size=1024, - tau=0.005, gamma=0.99, - gradient_steps=-1, - policy_delay=2, - target_policy_noise=0.2, - target_noise_clip=0.5, actor_architecture="mlp", + **kwargs, # allow additional params for specific algorithms ): super().__init__() self.learning_role = learning_role self.learning_rate = learning_rate - self.episodes_collecting_initial_experience = ( - episodes_collecting_initial_experience - ) self.batch_size = batch_size self.gamma = gamma - self.tau = tau - - self.gradient_steps = gradient_steps - - self.policy_delay = policy_delay - self.target_noise_clip = target_noise_clip - self.target_policy_noise = target_policy_noise if actor_architecture in actor_architecture_aliases: self.actor_architecture_class = actor_architecture_aliases[ diff --git a/assume/reinforcement_learning/algorithms/matd3.py 
b/assume/reinforcement_learning/algorithms/matd3.py index 20438824a..b5100801e 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -46,16 +46,16 @@ def __init__( super().__init__( learning_role, learning_rate, - episodes_collecting_initial_experience, batch_size, - tau, gamma, - gradient_steps, - policy_delay, - target_policy_noise, - target_noise_clip, actor_architecture, ) + self.episodes_collecting_initial_experience = episodes_collecting_initial_experience + self.tau = tau + self.gradient_steps = gradient_steps + self.policy_delay = policy_delay + self.target_policy_noise = target_policy_noise + self.target_noise_clip = target_noise_clip self.n_updates = 0 def save_params(self, directory): @@ -203,17 +203,6 @@ def load_actor_params(self, directory: str) -> None: - - - - - - - - - - - def initialize_policy(self, actors_and_critics: dict = None) -> None: """ Create actor and critic networks for reinforcement learning. @@ -519,6 +508,7 @@ def update_policy(self): all_actions_clone[:, i, :] = action_i all_actions_clone = all_actions_clone.view(self.batch_size, -1) + # Policy gradient calculation start (different for PPO) actor_loss = -critic.q1_forward( all_states, all_actions_clone ).mean() @@ -526,6 +516,7 @@ def update_policy(self): actor.optimizer.zero_grad() actor_loss.backward() actor.optimizer.step() + # Policy gradient calculation end polyak_update( critic.parameters(), critic_target.parameters(), self.tau @@ -535,3 +526,4 @@ def update_policy(self): ) i += 1 + diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 2a3ab93c4..e2dbfe388 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -11,8 +11,7 @@ from assume.common.base import LearningStrategy from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm -# Check later -from assume.reinforcement_learning.learning_utils import Actor, CriticPPO, polyak_update +from assume.reinforcement_learning.neural_network_architecture import ActorPPO, CriticPPO logger = logging.getLogger(__name__) @@ -34,44 +33,31 @@ class PPO(RLAlgorithm): def __init__( self, learning_role, - learning_rate=1e-4, - batch_size=1024, - gamma=0.99, # Discount factor used for future reward consideration. - - # Added PPO parameters - epochs=10, # Number of epochs to train each policy update. - clip_ratio=0.2, # Clipping ratio for PPO's policy loss to limit the change in policy update. Anstatt den vollen Wert der Änderung zu maximieren, wird das Ziel in einem bestimmten Bereich begrenzt. Der Bereich ist durch einen Parameter (den Clipping-Faktor) definiert, der angibt, wie weit die neue Policy von der alten Policy abweichen darf. Wenn das Verhältnis der Wahrscheinlichkeiten zu weit von 1 abweicht, wird die Verbesserung beschnitten. - vf_coef=0.5, # Coefficient for value function loss in PPO. - entropy_coef=0.01, # Coefficient for entropy bonus (to encourage exploration). - max_grad_norm=0.5, # Maximum gradient norm for gradient clipping. 
- - # Removed MATD3 parameters - # episodes_collecting_initial_experience=100, - # tau=0.005, - # gradient_steps=-1, - # policy_delay=2, - # target_policy_noise=0.2, - # target_noise_clip=0.5, + learning_rate=1e-4, + batch_size=1024, + gamma=0.99, + epochs=10, # PPO specific + clip_ratio=0.2, # PPO specific + vf_coef=0.5, # PPO specific + entropy_coef=0.01, # PPO specific + max_grad_norm=0.5, # PPO specific + gae_lambda=0.95, # PPO specific + actor_architecture="mlp", ): super().__init__( learning_role, learning_rate, batch_size, gamma, - - # episodes_collecting_initial_experience, - # tau, - # gradient_steps, - # policy_delay, - # target_policy_noise, - # target_noise_clip, + actor_architecture, ) - #self.n_updates = 0 + self.epochs = epochs self.clip_ratio = clip_ratio self.vf_coef = vf_coef self.entropy_coef = entropy_coef self.max_grad_norm = max_grad_norm - self.epochs = epochs + self.gae_lambda = gae_lambda + # Unchanged method from MATD3 def save_params(self, directory): @@ -108,7 +94,7 @@ def save_critic_params(self, directory): path = f"{directory}/critic_{u_id}.pt" th.save(obj, path) - # Removed actor_target in comparison to MATD3 + # Removed actor_target in comparison to MATD3 (Actor network = policy network) def save_actor_params(self, directory): """ Save the parameters of actor networks. @@ -148,7 +134,7 @@ def load_params(self, directory: str) -> None: self.load_critic_params(directory) self.load_actor_params(directory) - # Removed critic_target in comparison to MATD3 + # Removed critic_target in comparison to MATD3 (critic network = value function network) def load_critic_params(self, directory: str) -> None: """ Load the parameters of critic networks from a specified directory. @@ -269,7 +255,7 @@ def create_actors(self) -> None: act_dim_list = [] for _, unit_strategy in self.learning_role.rl_strats.items(): - unit_strategy.actor = Actor( + unit_strategy.actor = ActorPPO( obs_dim=unit_strategy.obs_dim, act_dim=unit_strategy.act_dim, float_type=self.float_type, @@ -320,13 +306,6 @@ def create_critics(self) -> None: for u_id, strategy in self.learning_role.rl_strats.items(): - self, - n_agents: int, - obs_dim: int, - float_type, - unique_obs_dim: int, - - self.learning_role.critics[u_id] = CriticPPO( n_agents=n_agents, obs_dim=strategy.obs_dim, diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index b1ec386a9..d7184a285 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -205,7 +205,7 @@ def __init__(self, buffer_size, obs_dim, act_dim, n_agents, gamma=0.99, gae_lamb self.values = np.zeros((buffer_size, n_agents), dtype=np.float32) self.advantages = np.zeros((buffer_size, n_agents), dtype=np.float32) self.returns = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.masks = np.ones((buffer_size, n_agents), dtype=np.float32) # Used to indicate episode boundaries + self.masks = np.ones((buffer_size, n_agents), dtype=np.float32) # Mask to indicate episode boundaries (1 for ongoing episode, 0 if episode ended) self.pos = 0 @@ -238,19 +238,45 @@ def compute_returns_and_advantages(self, last_values, dones): last_values (np.array): Value estimates for the last observation. dones (np.array): Whether the episode has finished for each agent. """ + # Initialize the last advantage to 0. This will accumulate as we move backwards in time. last_advantage = 0 + + # Loop backward through all the steps in the buffer to calculate returns and advantages. 
+ # This is because GAE (Generalized Advantage Estimation) relies on future rewards, + # so we compute it from the last step back to the first step. for step in reversed(range(self.pos)): + + # If we are at the last step in the buffer if step == self.pos - 1: + # If it's the last step, check whether the episode has finished using `dones`. + # `next_non_terminal` is 0 if the episode has ended, 1 if it's ongoing. next_non_terminal = 1.0 - dones + # Use the provided last values (value estimates for the final observation in the episode) next_values = last_values else: + # For other steps, use the mask to determine if the episode is ongoing. + # If `masks[step + 1]` is 1, the episode is ongoing; if it's 0, the episode has ended. next_non_terminal = self.masks[step + 1] + # Use the value of the next time step to compute the future returns next_values = self.values[step + 1] + # Temporal difference (TD) error, also known as delta: + # This is the difference between the reward obtained at this step and the estimated value of this step + # plus the discounted value of the next step (if the episode is ongoing). + # This measures how "off" the value function is at predicting the future return. delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step] + + # Compute the advantage for this step using GAE: + # `delta` is the immediate advantage, and we add to it the discounted future advantage, + # scaled by the factor `lambda` (from GAE). This allows for a more smooth approximation of advantage. + # `next_non_terminal` ensures that if the episode has ended, the future advantage stops accumulating. self.advantages[step] = last_advantage = delta + self.gamma * self.gae_lambda * next_non_terminal * last_advantage + + # The return is the advantage plus the baseline value estimate. + # This makes sure that the return includes both the immediate rewards and the learned value of future rewards. self.returns[step] = self.advantages[step] + self.values[step] + def get(self): """ Get all data stored in the buffer and convert it to PyTorch tensors. 
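To tie the pieces above together, here is a small usage sketch of the rollout/GAE cycle with the RolloutBuffer added in this patch; the environment interaction is replaced by random data, and the sizes are purely illustrative.

import numpy as np
from assume.reinforcement_learning.buffer import RolloutBuffer

n_steps, n_agents, obs_dim, act_dim = 8, 2, 4, 1
buffer = RolloutBuffer(
    buffer_size=n_steps, obs_dim=obs_dim, act_dim=act_dim,
    n_agents=n_agents, gamma=0.99, gae_lambda=0.95, device="cpu",
)

for _ in range(n_steps):
    # In the real loop these come from the actors, critics and the market simulation.
    buffer.add(
        obs=np.random.randn(n_agents, obs_dim).astype(np.float32),
        actions=np.random.randn(n_agents, act_dim).astype(np.float32),
        rewards=np.random.randn(n_agents).astype(np.float32),
        log_probs=np.random.randn(n_agents).astype(np.float32),
        values=np.random.randn(n_agents).astype(np.float32),
        dones=np.zeros(n_agents, dtype=np.float32),
    )

# Bootstrap with the critic's estimate for the state after the last stored step,
# then read the whole rollout as tensors and clear the buffer for the next update.
buffer.compute_returns_and_advantages(
    last_values=np.zeros(n_agents, dtype=np.float32),
    dones=np.zeros(n_agents, dtype=np.float32),
)
observations, actions, log_probs, advantages, returns, masks = buffer.get()
buffer.reset()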
diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index a3f3572d6..96c8656f5 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -12,7 +12,9 @@ from assume.common.base import LearningConfig, LearningStrategy from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm from assume.reinforcement_learning.algorithms.matd3 import TD3 +from assume.reinforcement_learning.algorithms.ppo import PPO from assume.reinforcement_learning.buffer import ReplayBuffer +from assume.reinforcement_learning.buffer import RolloutBuffer logger = logging.getLogger(__name__) @@ -33,21 +35,16 @@ def __init__( self, learning_config: LearningConfig, ): - # how many learning roles do exist and how are they named - self.buffer: ReplayBuffer = None - self.early_stopping_steps = learning_config.get("early_stopping_steps", 10) - self.early_stopping_threshold = learning_config.get( - "early_stopping_threshold", 0.05 - ) + # General parameters + self.rl_algorithm_name = learning_config["algorithm"] + self.early_stopping_steps = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_steps", 10) + self.early_stopping_threshold = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_threshold", 0.05) self.episodes_done = 0 self.rl_strats: dict[int, LearningStrategy] = {} - self.rl_algorithm = learning_config["algorithm"] - self.actor_architecture = learning_config["actor_architecture"] + self.critics = {} - self.target_critics = {} # define whether we train model or evaluate it - self.training_episodes = learning_config["training_episodes"] self.learning_mode = learning_config["learning_mode"] self.continue_learning = learning_config["continue_learning"] self.perform_evaluation = learning_config["perform_evaluation"] @@ -56,48 +53,66 @@ def __init__( "trained_policies_load_path", self.trained_policies_save_path ) - cuda_device = ( - learning_config["device"] - if "cuda" in learning_config.get("device", "cpu") - else "cpu" - ) - self.device = th.device(cuda_device if th.cuda.is_available() else "cpu") - + self.device = th.device(learning_config["device"] if th.cuda.is_available() else "cpu") + + # Algorithm-specific parameters + if self.rl_algorithm_name == "matd3": + self.buffer: ReplayBuffer = None + self.target_critics = {} + + self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get("actor_architecture", "mlp") + self.training_episodes = learning_config["matd3"]["training_episodes"] + self.train_freq = learning_config["matd3"]["train_freq"] + self.gradient_steps = int(self.train_freq[:-1]) if learning_config["matd3"].get("gradient_steps", -1) == -1 else learning_config["matd3"]["gradient_steps"] + # self.batch_size = learning_config["matd3"]["batch_size"] + # self.gamma = learning_config["matd3"]["gamma"] + + self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get("batch_size", 128) + self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) + + self.learning_rate = learning_config["matd3"]["learning_rate"] + self.noise_sigma = learning_config["matd3"]["noise_sigma"] + self.noise_scale = learning_config["matd3"]["noise_scale"] + self.episodes_collecting_initial_experience = max(learning_config.get(self.rl_algorithm_name, {}).get("episodes_collecting_initial_experience", 5), 1) + + elif self.rl_algorithm_name == "ppo": + self.buffer: RolloutBuffer = None + self.actor_architecture = 
learning_config.get(self.rl_algorithm_name, {}).get("actor_architecture", "mlp") + self.training_episodes = learning_config["ppo"]["training_episodes"] + self.steps_per_epoch = learning_config["ppo"]["steps_per_epoch"] + # self.batch_size = learning_config["matd3"]["batch_size"] + # self.gamma = learning_config["matd3"]["gamma"] + + self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get("batch_size", 128) + self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) + + self.clip_ratio = learning_config["ppo"]["clip_ratio"] + self.entropy_coeff = learning_config["ppo"]["entropy_coeff"] + self.value_coeff = learning_config["ppo"]["value_coeff"] + self.device = th.device(learning_config["ppo"]["device"] if th.cuda.is_available() else "cpu") + self.learning_rate = learning_config["ppo"]["learning_rate"] + + + + + # Set up CUDA and float types + th.backends.cuda.matmul.allow_tf32 = True + th.backends.cudnn.allow_tf32 = True + # future: add option to choose between float16 and float32 # float_type = learning_config.get("float_type", "float32") self.float_type = th.float - th.backends.cuda.matmul.allow_tf32 = True - th.backends.cudnn.allow_tf32 = True - - self.learning_rate = learning_config.get("learning_rate", 1e-4) - - # if we do not have initital experience collected we will get an error as no samples are avaiable on the - # buffer from which we can draw exprience to adapt the strategy, hence we set it to minium one episode - - self.episodes_collecting_initial_experience = max( - learning_config.get("episodes_collecting_initial_experience", 5), 1 - ) - - self.train_freq = learning_config.get("train_freq", "1h") - self.gradient_steps = ( - int(self.train_freq[:-1]) - if learning_config.get("gradient_steps", -1) == -1 - else learning_config["gradient_steps"] - ) - self.batch_size = learning_config.get("batch_size", 128) - self.gamma = learning_config.get("gamma", 0.99) + # Initialize the algorithm depending on the type + self.create_learning_algorithm(self.rl_algorithm_name) + # Initialize evaluation metrics self.eval_episodes_done = 0 - - # function that initializes learning, needs to be an extra function so that it can be called after buffer is given to Role - self.create_learning_algorithm(self.rl_algorithm) - - # store evaluation values self.max_eval = defaultdict(lambda: -1e9) self.rl_eval = defaultdict(list) - # list of avg_changes + # List of avg changes self.avg_rewards = [] + # TD3 and PPO def load_inter_episodic_data(self, inter_episodic_data): @@ -214,16 +229,10 @@ def get_noise_scale(self) -> None: stored_scale = list(self.rl_strats.values())[0].action_noise.scale return stored_scale - - def create_learning_algorithm(self, algorithm: RLAlgorithm): + + def create_learning_algorithm(self, algorithm: str): """ - Create and initialize the reinforcement learning algorithm. - - This method creates and initializes the reinforcement learning algorithm based on the specified algorithm name. The algorithm - is associated with the learning role and configured with relevant hyperparameters. - - Args: - algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. 
+ Algorithm initialization depending on the type """ if algorithm == "matd3": self.rl_algorithm = TD3( @@ -235,9 +244,27 @@ def create_learning_algorithm(self, algorithm: RLAlgorithm): gamma=self.gamma, actor_architecture=self.actor_architecture, ) + elif algorithm == "ppo": + self.rl_algorithm = PPO( + learning_role=self, + learning_rate=self.learning_rate, + steps_per_epoch=self.steps_per_epoch, + batch_size=self.batch_size, + gamma=self.gamma, + clip_ratio=self.clip_ratio, + entropy_coeff=self.entropy_coeff, + value_coeff=self.value_coeff, + actor_architecture=self.actor_architecture, + ) else: logger.error(f"Learning algorithm {algorithm} not implemented!") + # Loop over rl_strats + # self.rl_algorithm an die Learning Strategy übergeben + # Damit die Learning Strategy auf act/get_actions zugreifen kann + + + # TD3 def initialize_policy(self, actors_and_critics: dict = None) -> None: """ @@ -343,3 +370,97 @@ def compare_and_save_policies(self, metrics: dict) -> bool: return True return False + + # def __init__( + # self, + # learning_config: LearningConfig, + # ): + # # how many learning roles do exist and how are they named + # self.buffer: ReplayBuffer = None + # self.early_stopping_steps = learning_config.get("early_stopping_steps", 10) + # self.early_stopping_threshold = learning_config.get( + # "early_stopping_threshold", 0.05 + # ) + # self.episodes_done = 0 + # self.rl_strats: dict[int, LearningStrategy] = {} + # self.rl_algorithm = learning_config["algorithm"] + # self.actor_architecture = learning_config["actor_architecture"] + # self.critics = {} + # self.target_critics = {} + + # # define whether we train model or evaluate it + # self.training_episodes = learning_config["training_episodes"] + # self.learning_mode = learning_config["learning_mode"] + # self.continue_learning = learning_config["continue_learning"] + # self.perform_evaluation = learning_config["perform_evaluation"] + # self.trained_policies_save_path = learning_config["trained_policies_save_path"] + # self.trained_policies_load_path = learning_config.get( + # "trained_policies_load_path", self.trained_policies_save_path + # ) + + # cuda_device = ( + # learning_config["device"] + # if "cuda" in learning_config.get("device", "cpu") + # else "cpu" + # ) + # self.device = th.device(cuda_device if th.cuda.is_available() else "cpu") + + # # future: add option to choose between float16 and float32 + # # float_type = learning_config.get("float_type", "float32") + # self.float_type = th.float + + # th.backends.cuda.matmul.allow_tf32 = True + # th.backends.cudnn.allow_tf32 = True + + # self.learning_rate = learning_config.get("learning_rate", 1e-4) + + # # if we do not have initital experience collected we will get an error as no samples are avaiable on the + # # buffer from which we can draw exprience to adapt the strategy, hence we set it to minium one episode + + # self.episodes_collecting_initial_experience = max( + # learning_config.get("episodes_collecting_initial_experience", 5), 1 + # ) + + # self.train_freq = learning_config.get("train_freq", "1h") + # self.gradient_steps = ( + # int(self.train_freq[:-1]) + # if learning_config.get("gradient_steps", -1) == -1 + # else learning_config["gradient_steps"] + # ) + # self.batch_size = learning_config.get("batch_size", 128) + # self.gamma = learning_config.get("gamma", 0.99) + + # self.eval_episodes_done = 0 + + # # function that initializes learning, needs to be an extra function so that it can be called after buffer is given to Role + # 
self.create_learning_algorithm(self.rl_algorithm) + + # # store evaluation values + # self.max_eval = defaultdict(lambda: -1e9) + # self.rl_eval = defaultdict(list) + # # list of avg_changes + # self.avg_rewards = [] + + # MATD3 version + # def create_learning_algorithm(self, algorithm: RLAlgorithm): + # """ + # Create and initialize the reinforcement learning algorithm. + + # This method creates and initializes the reinforcement learning algorithm based on the specified algorithm name. The algorithm + # is associated with the learning role and configured with relevant hyperparameters. + + # Args: + # algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. + # """ + # if algorithm == "matd3": + # self.rl_algorithm = TD3( + # learning_role=self, + # learning_rate=self.learning_rate, + # episodes_collecting_initial_experience=self.episodes_collecting_initial_experience, + # gradient_steps=self.gradient_steps, + # batch_size=self.batch_size, + # gamma=self.gamma, + # actor_architecture=self.actor_architecture, + # ) + # else: + # logger.error(f"Learning algorithm {algorithm} not implemented!") \ No newline at end of file diff --git a/assume/reinforcement_learning/learning_utils.py b/assume/reinforcement_learning/learning_utils.py index 78e5ff01c..d7d181958 100644 --- a/assume/reinforcement_learning/learning_utils.py +++ b/assume/reinforcement_learning/learning_utils.py @@ -20,148 +20,6 @@ class ObsActRew(TypedDict): observation_dict = dict[list[datetime], ObsActRew] -# TD3 and PPO -class Actor(nn.Module): - """ - The neurnal network for the actor. - """ - - def __init__(self, obs_dim: int, act_dim: int, float_type): - super(Actor, self).__init__() - - self.FC1 = nn.Linear(obs_dim, 256, dtype=float_type) - self.FC2 = nn.Linear(256, 128, dtype=float_type) - self.FC3 = nn.Linear(128, act_dim, dtype=float_type) - - def forward(self, obs): - x = F.relu(self.FC1(obs)) - x = F.relu(self.FC2(x)) - x = F.softsign(self.FC3(x)) - # x = th.tanh(self.FC3(x)) - - return x - - -# TD3 -class CriticTD3(nn.Module): - """Initialize parameters and build model. 
- - Args: - n_agents (int): Number of agents - obs_dim (int): Dimension of each state - act_dim (int): Dimension of each action - """ - - def __init__( - self, - n_agents: int, - obs_dim: int, - act_dim: int, - float_type, - unique_obs_dim: int = 0, - ): - super(CriticTD3, self).__init__() - - self.obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) - self.act_dim = act_dim * n_agents - - # Q1 architecture - if n_agents <= 50: - self.FC1_1 = nn.Linear(self.obs_dim + self.act_dim, 512, dtype=float_type) - self.FC1_2 = nn.Linear(512, 256, dtype=float_type) - self.FC1_3 = nn.Linear(256, 128, dtype=float_type) - self.FC1_4 = nn.Linear(128, 1, dtype=float_type) - else: - self.FC1_1 = nn.Linear(self.obs_dim + self.act_dim, 1024, dtype=float_type) - self.FC1_2 = nn.Linear(1024, 512, dtype=float_type) - self.FC1_3 = nn.Linear(512, 128, dtype=float_type) - self.FC1_4 = nn.Linear(128, 1, dtype=float_type) - - # Q2 architecture - if n_agents <= 50: - self.FC2_1 = nn.Linear(self.obs_dim + self.act_dim, 512, dtype=float_type) - self.FC2_2 = nn.Linear(512, 256, dtype=float_type) - self.FC2_3 = nn.Linear(256, 128, dtype=float_type) - self.FC2_4 = nn.Linear(128, 1, dtype=float_type) - else: - self.FC2_1 = nn.Linear(self.obs_dim + self.act_dim, 1024, dtype=float_type) - self.FC2_2 = nn.Linear(1024, 512, dtype=float_type) - self.FC2_3 = nn.Linear(512, 128, dtype=float_type) - self.FC2_4 = nn.Linear(128, 1, dtype=float_type) - - def forward(self, obs, actions): - """ - Forward pass through the network, from observation to actions. - """ - xu = th.cat([obs, actions], 1) - - x1 = F.relu(self.FC1_1(xu)) - x1 = F.relu(self.FC1_2(x1)) - x1 = F.relu(self.FC1_3(x1)) - x1 = self.FC1_4(x1) - - x2 = F.relu(self.FC2_1(xu)) - x2 = F.relu(self.FC2_2(x2)) - x2 = F.relu(self.FC2_3(x2)) - x2 = self.FC2_4(x2) - - return x1, x2 - - def q1_forward(self, obs, actions): - """ - Only predict the Q-value using the first network. - This allows to reduce computation when all the estimates are not needed - (e.g. when updating the policy in TD3). - - Args: - obs (torch.Tensor): The observations - actions (torch.Tensor): The actions - - """ - x = th.cat([obs, actions], 1) - x = F.relu(self.FC1_1(x)) - x = F.relu(self.FC1_2(x)) - x = F.relu(self.FC1_3(x)) - x = self.FC1_4(x) - - return x - - -class CriticPPO(nn.Module): - """Critic Network for Proximal Policy Optimization (PPO) in a Multi-Agent Setting. 
- - Args: - n_agents (int): Number of agents - obs_dim (int): Dimension of each state - unique_obs_dim (int): Unique observation dimension per agent - float_type: Data type for the model parameters - """ - # Actor dimension missing compared to MATD3 -> not needed for PPO - def __init__( - self, - n_agents: int, - obs_dim: int, - float_type, - unique_obs_dim: int, - ): - - super(CriticPPO, self).__init__() - - # Define the combined observation dimension - combined_obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) - - # Define the architecture of the Critic network - self.fc1 = nn.Linear(combined_obs_dim, 256, dtype=float_type) - self.fc2 = nn.Linear(256, 128, dtype=float_type) - self.fc3 = nn.Linear(128, 1, dtype=float_type) - - def forward(self, x): - """Forward pass through the network.""" - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - value = self.fc3(x) - return value - # TD3 # Ornstein-Uhlenbeck Noise diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index 565d6998e..3cbb76aed 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -91,6 +91,82 @@ def q1_forward(self, obs, actions): return x +class CriticPPO(nn.Module): + """Critic Network for Proximal Policy Optimization (PPO) in a Multi-Agent Setting. + + Args: + n_agents (int): Number of agents + obs_dim (int): Dimension of each state + unique_obs_dim (int): Unique observation dimension per agent + float_type: Data type for the model parameters + """ + # Actor dimension missing compared to MATD3 -> not needed for PPO + def __init__( + self, + n_agents: int, + obs_dim: int, + float_type, + unique_obs_dim: int, + ): + + super(CriticPPO, self).__init__() + + # Define the combined observation dimension + combined_obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) + + # Define the architecture of the Critic network + self.fc1 = nn.Linear(combined_obs_dim, 256, dtype=float_type) + self.fc2 = nn.Linear(256, 128, dtype=float_type) + self.fc3 = nn.Linear(128, 1, dtype=float_type) + + def forward(self, x): + """Forward pass through the network.""" + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + value = self.fc3(x) + return value + +class ActorPPO(nn.Module): + """ + Actor network for PPO using MLP architecture with action sampling. + + Args: + obs_dim (int): Dimension of the observation space. + act_dim (int): Dimension of the action space. + float_type: Data type for the model parameters. + """ + def __init__(self, obs_dim: int, act_dim: int, float_type): + super().__init__() + # Define the actor network layers + self.fc1 = nn.Linear(obs_dim, 256, dtype=float_type) + self.fc2 = nn.Linear(256, 128, dtype=float_type) + self.fc3 = nn.Linear(128, act_dim, dtype=float_type) + + def forward(self, obs): + """Forward pass to generate action logits.""" + x = F.relu(self.fc1(obs)) + x = F.relu(self.fc2(x)) + action_logits = self.fc3(x) # action_logits are mean values for continuous action space + return F.tanh(action_logits) # Bound action space between [-1, 1] + + def act(self, obs): + """ + Samples an action and returns both the action and its log probability. + + Args: + obs (torch.Tensor): The observation input. + + Returns: + action (torch.Tensor): The sampled action. + log_prob (torch.Tensor): Log probability of the action. 
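+
+        Example (illustrative sketch only; the dimensions below are placeholders,
+        not values taken from an ASSUME scenario):
+            >>> actor = ActorPPO(obs_dim=4, act_dim=2, float_type=th.float32)
+            >>> obs = th.zeros(4)
+            >>> action, log_prob = actor.act(obs)
+            >>> action.shape, log_prob.shape
+            (torch.Size([2]), torch.Size([]))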
+ """ + action_logits = self.forward(obs) + action_dist = th.distributions.Normal(action_logits, 1.0) # Assuming standard deviation of 1 for simplicity + action = action_dist.sample() # Choose a random action from the distribution + log_prob = action_dist.log_prob(action).sum(dim=-1) # Summing log probs across action dimensions + return action, log_prob + + class Actor(nn.Module): """ Parent class for actor networks. diff --git a/assume/reinforcement_learning/raw_ppo.py b/assume/reinforcement_learning/raw_ppo.py new file mode 100644 index 000000000..5af03b5dd --- /dev/null +++ b/assume/reinforcement_learning/raw_ppo.py @@ -0,0 +1,189 @@ +import gym +import torch +import torch.nn as nn +import torch.optim as optim +from collections import deque +import numpy as np + +class MLPActorCritic(nn.Module): + """ + Simple MLP Actor-Critic network with separate actor and critic heads. + """ + def __init__(self, obs_dim, act_dim): + super(MLPActorCritic, self).__init__() + self.shared = nn.Sequential( + nn.Linear(obs_dim, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU() + ) + # Actor head + self.actor = nn.Linear(64, act_dim) + # Critic head + self.critic = nn.Linear(64, 1) + + def forward(self, obs): + shared_out = self.shared(obs) + return self.actor(shared_out), self.critic(shared_out) + + def act(self, obs): + logits, value = self(obs) + dist = torch.distributions.Categorical(logits=logits) + action = dist.sample() + return action, dist.log_prob(action), value + + def evaluate_actions(self, obs, actions): + logits, values = self(obs) + dist = torch.distributions.Categorical(logits=logits) + action_log_probs = dist.log_prob(actions) + dist_entropy = dist.entropy() + return action_log_probs, torch.squeeze(values, dim=-1), dist_entropy + + +class PPO: + """ + Proximal Policy Optimization (PPO) implementation in PyTorch. + """ + def __init__(self, env, actor_critic, clip_param=0.2, entcoeff=0.01, optim_stepsize=1e-3, optim_epochs=4, gamma=0.99, lam=0.95, batch_size=64): + self.env = env + self.actor_critic = actor_critic + self.clip_param = clip_param + self.entcoeff = entcoeff + self.optim_epochs = optim_epochs + self.optim_stepsize = optim_stepsize + self.gamma = gamma + self.lam = lam + self.batch_size = batch_size + self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=self.optim_stepsize) + + def discount_rewards(self, rewards, dones, gamma): + """ + Compute discounted rewards. + """ + discounted_rewards = [] + r = 0 + for reward, done in zip(reversed(rewards), reversed(dones)): + if done: + r = 0 + r = reward + gamma * r + discounted_rewards.insert(0, r) + return discounted_rewards + + def compute_gae(self, rewards, values, dones, gamma, lam): + """ + Compute Generalized Advantage Estimation (GAE). + """ + adv = 0 + advantages = [] + for t in reversed(range(len(rewards))): + delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t] + adv = delta + gamma * lam * adv * (1 - dones[t]) + advantages.insert(0, adv) + return advantages + + def rollout(self, timesteps_per_actorbatch): + """ + Collect trajectories by running the policy in the environment. 
+ """ + # Reset env + obs = self.env.reset() + obs_list, actions_list, rewards_list, dones_list, log_probs_list, values_list = [], [], [], [], [], [] + for _ in range(timesteps_per_actorbatch): + obs_tensor = torch.FloatTensor(obs).unsqueeze(0) + action, log_prob, value = self.actor_critic.act(obs_tensor) + + obs_list.append(obs_tensor) + actions_list.append(action) + log_probs_list.append(log_prob) + values_list.append(value) + + next_obs, reward, done, _ = self.env.step(action.item()) + rewards_list.append(reward) + dones_list.append(done) + + obs = next_obs + if done: + obs = self.env.reset() + + obs_tensor = torch.FloatTensor(obs).unsqueeze(0) + _, _, last_value = self.actor_critic.act(obs_tensor) + + values_list.append(last_value) + + return { + "observations": torch.cat(obs_list), + "actions": torch.cat(actions_list), + "log_probs": torch.cat(log_probs_list), + "values": torch.cat(values_list), + "rewards": rewards_list, + "dones": dones_list, + } + + def ppo_update(self, batch, clip_param, entcoeff): + """ + Update the policy using PPO objective. + """ + observations, actions, old_log_probs, returns, advantages = batch + + for _ in range(self.optim_epochs): + new_log_probs, values, entropy = self.actor_critic.evaluate_actions(observations, actions) + + ratio = torch.exp(new_log_probs - old_log_probs) + surr1 = ratio * advantages + surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages + + policy_loss = -torch.min(surr1, surr2).mean() + value_loss = (returns - values).pow(2).mean() + entropy_loss = entropy.mean() + + loss = policy_loss + 0.5 * value_loss - entcoeff * entropy_loss + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + def train(self, total_timesteps, timesteps_per_actorbatch, log_interval=100): + """ + Main training loop. 
+ """ + total_timesteps_done = 0 + reward_history = deque(maxlen=100) + + while total_timesteps_done < total_timesteps: + # Rollout + batch = self.rollout(timesteps_per_actorbatch) + observations = batch["observations"] + actions = batch["actions"] + old_log_probs = batch["log_probs"] + rewards = batch["rewards"] + dones = batch["dones"] + values = batch["values"].detach() + + # Compute discounted rewards and advantages + returns = torch.FloatTensor(self.discount_rewards(rewards, dones, self.gamma)) + advantages = torch.FloatTensor(self.compute_gae(rewards, values.numpy(), dones, self.gamma, self.lam)) + + # Normalize advantages + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # Update the policy using PPO + batch_data = (observations, actions, old_log_probs, returns, advantages) + self.ppo_update(batch_data, self.clip_param, self.entcoeff) + + total_timesteps_done += timesteps_per_actorbatch + avg_reward = np.mean(rewards) + reward_history.append(avg_reward) + + if total_timesteps_done % log_interval == 0: + print(f"Timesteps: {total_timesteps_done}, Avg Reward: {np.mean(reward_history)}") + + +# Example usage with CartPole environment +env = gym.make('CartPole-v1') +obs_dim = env.observation_space.shape[0] +act_dim = env.action_space.n + +actor_critic = MLPActorCritic(obs_dim, act_dim) +ppo = PPO(env, actor_critic, clip_param=0.2, entcoeff=0.01, optim_stepsize=1e-3, optim_epochs=4, gamma=0.99, lam=0.95) + +ppo.train(total_timesteps=10000, timesteps_per_actorbatch=256) diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index dee013371..941229ff7 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -874,75 +874,74 @@ def run_learning( verbose: bool = False, ) -> None: """ - Train Deep Reinforcement Learning (DRL) agents to act in a simulated market environment. - - This function runs multiple episodes of simulation to train DRL agents, performs evaluation, and saves the best runs. It maintains the buffer and learned agents in memory to avoid resetting them with each new run. - - Args: - world (World): An instance of the World class representing the simulation environment. - inputs_path (str): The path to the folder containing input files necessary for the simulation. - scenario (str): The name of the scenario for the simulation. - study_case (str): The specific study case for the simulation. - - Note: - - The function uses a ReplayBuffer to store experiences for training the DRL agents. - - It iterates through training episodes, updating the agents and evaluating their performance at regular intervals. - - Initial exploration is active at the beginning and is disabled after a certain number of episodes to improve the performance of DRL algorithms. - - Upon completion of training, the function performs an evaluation run using the best policy learned during training. - - The best policies are chosen based on the average reward obtained during the evaluation runs, and they are saved for future use. + Train Deep Reinforcement Learning (DRL) agents (either MATD3 or PPO) to act in a simulated market environment. + This function runs multiple episodes of simulation to train DRL agents, performs evaluation, and saves the best runs. + It maintains the buffer and learned agents in memory to avoid resetting them with each new run. 
""" - from assume.reinforcement_learning.buffer import ReplayBuffer + from assume.reinforcement_learning.buffer import ReplayBuffer, RolloutBuffer if not verbose: logger.setLevel(logging.WARNING) - # remove csv path so that nothing is written while learning temp_csv_path = world.export_csv_path world.export_csv_path = "" - # initialize policies already here to set the obs_dim and act_dim in the learning role actors_and_critics = None - world.learning_role.initialize_policy(actors_and_critics=actors_and_critics) + world.learning_role.initialize_policy(actors_and_critics=actors_and_critics) # Leads to the initialization of the Learning role, makes world.learning_role.rl_algorithm_name accessible world.output_role.del_similar_runs() - # check if we already stored policies for this simualtion save_path = world.learning_config["trained_policies_save_path"] if Path(save_path).is_dir(): - # we are in learning mode and about to train new policies, which might overwrite existing ones - accept = input( - f"{save_path=} exists - should we overwrite current learnings? (y/N) " - ) + accept = input(f"{save_path=} exists - should we overwrite current learnings? (y/N) ") if not accept.lower().startswith("y"): - # stop here - do not start learning or save anything - raise AssumeException("don't overwrite existing strategies") + raise AssumeException("Don't overwrite existing strategies") - # ----------------------------------------- - # Load scenario data to reuse across episodes + # Load scenario data scenario_data = load_config_and_create_forecaster(inputs_path, scenario, study_case) - # ----------------------------------------- - # Information that needs to be stored across episodes, aka one simulation run - inter_episodic_data = { - "buffer": ReplayBuffer( + print(world.learning_role.rl_algorithm_name) + + if world.learning_role.rl_algorithm_name == "matd3": + buffer = ReplayBuffer( buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), obs_dim=world.learning_role.rl_algorithm.obs_dim, act_dim=world.learning_role.rl_algorithm.act_dim, n_rl_units=len(world.learning_role.rl_strats), device=world.learning_role.device, float_type=world.learning_role.float_type, - ), + ) + elif world.learning_role.rl_algorithm_name == "ppo": + buffer = RolloutBuffer( + buffer_size=int(world.learning_config.get("rollout_buffer_size", 2048)), + obs_dim=world.learning_role.rl_algorithm.obs_dim, + act_dim=world.learning_role.rl_algorithm.act_dim, + n_rl_units=len(world.learning_role.rl_strats), + device=world.learning_role.device, + float_type=world.learning_role.float_type, + ) + + inter_episodic_data = { + "buffer": buffer, "actors_and_critics": None, "max_eval": defaultdict(lambda: -1e9), "all_eval": defaultdict(list), "avg_all_eval": [], "episodes_done": 0, "eval_episodes_done": 0, - "noise_scale": world.learning_config.get("noise_scale", 1.0), } - # ----------------------------------------- + if world.learning_role.rl_algorithm_name == "matd3": + # In MATD3, noise_scale is relevant because it adds noise to the actions taken by the agents + # during exploration. This is essential in deterministic policy gradient methods like TD3 and MATD3, + # where the agents need to explore the environment sufficiently to avoid getting stuck in local optima. + # Noise is added to the actions to encourage exploration. In contrast, PPO uses stochastic policies + # that naturally explore the environment by sampling actions from a probability distribution, making + # external noise addition unnecessary. 
+ inter_episodic_data["noise_scale"] = world.learning_config.get("noise_scale", 1.0) + + # Sets the validation interval: After how many episodes does validation take place validation_interval = min( world.learning_role.training_episodes, world.learning_config.get("validation_episodes_interval", 5), @@ -954,7 +953,6 @@ def run_learning( range(1, world.learning_role.training_episodes + 1), desc="Training Episodes", ): - # TODO normally, loading twice should not create issues, somehow a scheduling issue is raised currently if episode != 1: setup_world( world=world, @@ -963,27 +961,27 @@ def run_learning( episode=episode, ) - # ----------------------------------------- - # Give the newly initliazed learning role the needed information across episodes world.learning_role.load_inter_episodic_data(inter_episodic_data) - world.run() - # ----------------------------------------- - # Store updated information across episodes inter_episodic_data = world.learning_role.get_inter_episodic_data() inter_episodic_data["episodes_done"] = episode - # evaluation run: + # Reset the PPO Rollout Buffer after each episode + if world.learning_role.rl_algorithm_name == "ppo": + inter_episodic_data["buffer"].reset() + + + + + + # Perform validation at regular intervals if ( episode % validation_interval == 0 - and episode - >= world.learning_role.episodes_collecting_initial_experience - + validation_interval + and episode >= world.learning_role.episodes_collecting_initial_experience + validation_interval ): world.reset() - # load evaluation run setup_world( world=world, scenario_data=scenario_data, @@ -993,20 +991,24 @@ def run_learning( ) world.learning_role.load_inter_episodic_data(inter_episodic_data) - world.run() - total_rewards = world.output_role.get_sum_reward() - avg_reward = np.mean(total_rewards) - # check reward improvement in evaluation run - # and store best run in eval folder - terminate = world.learning_role.compare_and_save_policies( - {"avg_reward": avg_reward} - ) + if world.learning_role.rl_algorithm_name == "ppo": + advantages, returns = compute_advantages_and_returns(world, inter_episodic_data) + world.learning_role.update_policy_with_ppo(advantages, returns) + surrogate_loss = compute_surrogate_loss(world, inter_episodic_data) + terminate = world.learning_role.compare_and_save_policies({"surrogate_loss": surrogate_loss}) + + # Reset the PPO Rollout Buffer after validation + inter_episodic_data["buffer"].reset() + + elif world.learning_role.rl_algorithm_name == "matd3": + total_rewards = world.output_role.get_sum_reward() + avg_reward = np.mean(total_rewards) + terminate = world.learning_role.compare_and_save_policies({"avg_reward": avg_reward}) inter_episodic_data["eval_episodes_done"] = eval_episode - # if we have not improved in the last x evaluations, we stop loop if terminate: break @@ -1014,20 +1016,18 @@ def run_learning( world.reset() - # if at end of simulation save last policies if episode == (world.learning_role.training_episodes): world.learning_role.rl_algorithm.save_params( directory=f"{world.learning_role.trained_policies_save_path}/last_policies" ) - # container shutdown implicitly with new initialisation logger.info("################") logger.info("Training finished, Start evaluation run") world.export_csv_path = temp_csv_path world.reset() - # load scenario for evaluation + # Based on the parameters for setup_world, it is automatically recognized if training or evaluation is to be performed. Now the evaluation is performed. 
setup_world( world=world, scenario_data=scenario_data, @@ -1038,5 +1038,178 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) + +# def run_learning( +# world: World, +# inputs_path: str, +# scenario: str, +# study_case: str, +# verbose: bool = False, +# ) -> None: +# """ +# Train Deep Reinforcement Learning (DRL) agents to act in a simulated market environment. + +# This function runs multiple episodes of simulation to train DRL agents, performs evaluation, and saves the best runs. It maintains the buffer and learned agents in memory to avoid resetting them with each new run. + +# Args: +# world (World): An instance of the World class representing the simulation environment. +# inputs_path (str): The path to the folder containing input files necessary for the simulation. +# scenario (str): The name of the scenario for the simulation. +# study_case (str): The specific study case for the simulation. + +# Note: +# - The function uses a ReplayBuffer to store experiences for training the DRL agents. +# - It iterates through training episodes, updating the agents and evaluating their performance at regular intervals. +# - Initial exploration is active at the beginning and is disabled after a certain number of episodes to improve the performance of DRL algorithms. +# - Upon completion of training, the function performs an evaluation run using the best policy learned during training. +# - The best policies are chosen based on the average reward obtained during the evaluation runs, and they are saved for future use. +# """ +# from assume.reinforcement_learning.buffer import ReplayBuffer, RolloutBuffer + +# if not verbose: +# logger.setLevel(logging.WARNING) + +# # remove csv path so that nothing is written while learning +# temp_csv_path = world.export_csv_path +# world.export_csv_path = "" + +# # initialize policies already here to set the obs_dim and act_dim in the learning role +# actors_and_critics = None +# world.learning_role.initialize_policy(actors_and_critics=actors_and_critics) +# world.output_role.del_similar_runs() + +# # check if we already stored policies for this simualtion +# save_path = world.learning_config["trained_policies_save_path"] + +# if Path(save_path).is_dir(): +# # we are in learning mode and about to train new policies, which might overwrite existing ones +# accept = input( +# f"{save_path=} exists - should we overwrite current learnings? 
(y/N) " +# ) +# if not accept.lower().startswith("y"): +# # stop here - do not start learning or save anything +# raise AssumeException("don't overwrite existing strategies") + +# # ----------------------------------------- +# # Load scenario data to reuse across episodes +# scenario_data = load_config_and_create_forecaster(inputs_path, scenario, study_case) + +# # ----------------------------------------- +# # Information that needs to be stored across episodes, aka one simulation run +# inter_episodic_data = { +# "buffer": ReplayBuffer( +# buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), +# obs_dim=world.learning_role.rl_algorithm.obs_dim, +# act_dim=world.learning_role.rl_algorithm.act_dim, +# n_rl_units=len(world.learning_role.rl_strats), +# device=world.learning_role.device, +# float_type=world.learning_role.float_type, +# ), +# "actors_and_critics": None, +# "max_eval": defaultdict(lambda: -1e9), +# "all_eval": defaultdict(list), +# "avg_all_eval": [], +# "episodes_done": 0, +# "eval_episodes_done": 0, +# "noise_scale": world.learning_config.get("noise_scale", 1.0), +# } + +# # ----------------------------------------- + +# validation_interval = min( +# world.learning_role.training_episodes, +# world.learning_config.get("validation_episodes_interval", 5), +# ) + +# eval_episode = 1 + +# for episode in tqdm( +# range(1, world.learning_role.training_episodes + 1), +# desc="Training Episodes", +# ): +# # TODO normally, loading twice should not create issues, somehow a scheduling issue is raised currently +# if episode != 1: +# setup_world( +# world=world, +# scenario_data=scenario_data, +# study_case=study_case, +# episode=episode, +# ) + +# # ----------------------------------------- +# # Give the newly initliazed learning role the needed information across episodes +# world.learning_role.load_inter_episodic_data(inter_episodic_data) + +# world.run() + +# # ----------------------------------------- +# # Store updated information across episodes +# inter_episodic_data = world.learning_role.get_inter_episodic_data() +# inter_episodic_data["episodes_done"] = episode + +# # evaluation run: +# if ( +# episode % validation_interval == 0 +# and episode +# >= world.learning_role.episodes_collecting_initial_experience +# + validation_interval +# ): +# world.reset() + +# # load evaluation run +# setup_world( +# world=world, +# scenario_data=scenario_data, +# study_case=study_case, +# perform_evaluation=True, +# eval_episode=eval_episode, +# ) + +# world.learning_role.load_inter_episodic_data(inter_episodic_data) + +# world.run() + +# total_rewards = world.output_role.get_sum_reward() +# avg_reward = np.mean(total_rewards) +# # check reward improvement in evaluation run +# # and store best run in eval folder +# terminate = world.learning_role.compare_and_save_policies( +# {"avg_reward": avg_reward} +# ) + +# inter_episodic_data["eval_episodes_done"] = eval_episode + +# # if we have not improved in the last x evaluations, we stop loop +# if terminate: +# break + +# eval_episode += 1 + +# world.reset() + +# # if at end of simulation save last policies +# if episode == (world.learning_role.training_episodes): +# world.learning_role.rl_algorithm.save_params( +# directory=f"{world.learning_role.trained_policies_save_path}/last_policies" +# ) + +# # container shutdown implicitly with new initialisation +# logger.info("################") +# logger.info("Training finished, Start evaluation run") +# world.export_csv_path = temp_csv_path + +# world.reset() + +# # load scenario for 
evaluation +# setup_world( +# world=world, +# scenario_data=scenario_data, +# study_case=study_case, +# terminate_learning=True, +# ) + +# world.learning_role.load_inter_episodic_data(inter_episodic_data) + + if __name__ == "__main__": data = read_grid(Path("examples/inputs/example_01d")) diff --git a/assume/strategies/learning_advanced_orders.py b/assume/strategies/learning_advanced_orders.py index 6aa1d7e30..524e3057f 100644 --- a/assume/strategies/learning_advanced_orders.py +++ b/assume/strategies/learning_advanced_orders.py @@ -285,8 +285,6 @@ def calculate_bids( bids = self.remove_empty_bids(bids) return bids - - def get_actions(self, next_observation): """ Gets actions for a unit containing two bid prices depending on the observation diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 8da7961f6..0026b8bfd 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -14,19 +14,32 @@ tiny: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 10 - episodes_collecting_initial_experience: 3 - train_freq: 24h - gradient_steps: -1 - batch_size: 64 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 + matd3: + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 10 + episodes_collecting_initial_experience: 3 + train_freq: 24h + gradient_steps: -1 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + ppo: + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 10 + train_freq: 24h + gradient_steps: -1 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 markets_config: EOM: @@ -58,21 +71,38 @@ base: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 50 - episodes_collecting_initial_experience: 5 - train_freq: 24h - gradient_steps: -1 - batch_size: 256 - gamma: 0.99 - device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + matd3: + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 50 + episodes_collecting_initial_experience: 5 + train_freq: 24h + gradient_steps: -1 + batch_size: 256 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + early_stopping_steps: 10 + early_stopping_threshold: 0.05 + ppo: + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 50 + episodes_collecting_initial_experience: 5 + train_freq: 24h + gradient_steps: -1 + batch_size: 256 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + early_stopping_steps: 10 + early_stopping_threshold: 0.05 markets_config: EOM: diff --git a/examples/inputs/example_02a/config_backup.yaml b/examples/inputs/example_02a/config_backup.yaml new file mode 100644 index 000000000..8da7961f6 --- /dev/null +++ b/examples/inputs/example_02a/config_backup.yaml @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +tiny: + start_date: 2019-01-01 00:00 + end_date: 2019-01-05 00:00 + time_step: 1h + save_frequency_hours: null + learning_mode: True + + learning_config: + 
continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 + algorithm: matd3 + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 10 + episodes_collecting_initial_experience: 3 + train_freq: 24h + gradient_steps: -1 + batch_size: 64 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + + markets_config: + EOM: + operator: EOM_operator + product_type: energy + products: + - duration: 1h + count: 1 + first_delivery: 1h + opening_frequency: 1h + opening_duration: 1h + volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear + + +base: + start_date: 2019-03-01 00:00 + end_date: 2019-03-31 00:00 + time_step: 1h + save_frequency_hours: null + learning_mode: True + + learning_config: + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 + algorithm: matd3 + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 50 + episodes_collecting_initial_experience: 5 + train_freq: 24h + gradient_steps: -1 + batch_size: 256 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + early_stopping_steps: 10 + early_stopping_threshold: 0.05 + + markets_config: + EOM: + operator: EOM_operator + product_type: energy + products: + - duration: 1h + count: 1 + first_delivery: 1h + opening_frequency: 1h + opening_duration: 1h + volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear + +base_lstm: + start_date: 2019-03-01 00:00 + end_date: 2019-03-31 00:00 + time_step: 1h + save_frequency_hours: null + learning_mode: True + + learning_config: + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 + algorithm: matd3 + actor_architecture: lstm + learning_rate: 0.001 + training_episodes: 50 + episodes_collecting_initial_experience: 5 + train_freq: 24h + gradient_steps: -1 + batch_size: 256 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + early_stopping_steps: 10 + early_stopping_threshold: 0.05 + + markets_config: + EOM: + operator: EOM_operator + product_type: energy + products: + - duration: 1h + count: 1 + first_delivery: 1h + opening_frequency: 1h + opening_duration: 1h + volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear From 552f54928a1be73d090aa2789279014a6cf36818 Mon Sep 17 00:00:00 2001 From: ufqjh Date: Sat, 28 Sep 2024 10:35:13 +0200 Subject: [PATCH 03/23] Buffer changes, update_policy and further advancements in DRL related files --- assume/common/base.py | 11 +- .../algorithms/matd3.py | 70 +- .../reinforcement_learning/algorithms/ppo.py | 453 ++++------- assume/reinforcement_learning/buffer.py | 333 +++++--- .../reinforcement_learning/learning_role.py | 101 ++- .../learning_unit_operator.py | 68 +- .../reinforcement_learning/learning_utils.py | 50 ++ .../neural_network_architecture.py | 72 +- assume/scenario/loader_csv.py | 246 ++---- assume/strategies/learning_strategies.py | 77 +- assume/world.py | 37 +- examples/inputs/example_01a/forecasts_df.csv | 746 ++++++++++++++++++ examples/inputs/example_02a/config.yaml | 27 +- examples/inputs/example_02a/forecasts_df.csv | 122 +++ 14 files changed, 1648 insertions(+), 765 deletions(-) 
create mode 100644 examples/inputs/example_01a/forecasts_df.csv create mode 100644 examples/inputs/example_02a/forecasts_df.csv diff --git a/assume/common/base.py b/assume/common/base.py index 5b553257d..3c6f17d60 100644 --- a/assume/common/base.py +++ b/assume/common/base.py @@ -70,6 +70,8 @@ def __init__( self.outputs["rl_observations"] = [] self.outputs["rl_actions"] = [] self.outputs["rl_rewards"] = [] + # For PPO + self.outputs["rl_log_probs"] = [] # some data is stored as series to allow to store it in the outputs self.outputs["actions"] = pd.Series(0.0, index=self.index, dtype=object) @@ -781,8 +783,13 @@ def __init__( # them into suitable format for recurrent neural networks self.num_timeseries_obs_dim = num_timeseries_obs_dim - print("TEST") - print(kwargs.get("algorithm", "lel")) + self.rl_algorithm_name = kwargs.get("algorithm", "matd3") + if self.rl_algorithm_name == "matd3": + from assume.reinforcement_learning.algorithms.matd3 import get_actions + self.get_actions = get_actions + elif self.rl_algorithm_name == "ppo": + from assume.reinforcement_learning.algorithms.ppo import get_actions + self.get_actions = get_actions class LearningConfig(TypedDict): diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index b5100801e..2a5b3293a 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -508,7 +508,6 @@ def update_policy(self): all_actions_clone[:, i, :] = action_i all_actions_clone = all_actions_clone.view(self.batch_size, -1) - # Policy gradient calculation start (different for PPO) actor_loss = -critic.q1_forward( all_states, all_actions_clone ).mean() @@ -516,7 +515,6 @@ def update_policy(self): actor.optimizer.zero_grad() actor_loss.backward() actor.optimizer.step() - # Policy gradient calculation end polyak_update( critic.parameters(), critic_target.parameters(), self.tau @@ -527,3 +525,71 @@ def update_policy(self): i += 1 +def get_actions(rl_strategy, next_observation): + """ + Gets actions for a unit based on the observation using MATD3. + + Args: + rl_strategy (RLStrategy): The strategy containing relevant information. + next_observation (torch.Tensor): The observation. + + Returns: + torch.Tensor: The actions containing two bid prices. + tuple: The noise (if applicable). + + Note: + If the agent is in learning mode, the actions are chosen by the actor neuronal net and noise is added to the action. + In the first x episodes, the agent is in initial exploration mode, where the action is chosen by noise only to explore + the entire action space. X is defined by episodes_collecting_initial_experience. + If the agent is not in learning mode, the actions are chosen by the actor neuronal net without noise. 
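+
+        In learning mode after the initial exploration phase, the submitted action is therefore
+        clamp(actor(obs) + noise, -1, 1); during initial exploration it is the marginal-cost based
+        bid plus noise only, before clamping to [-1, 1].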
+ """ + + actor = rl_strategy.actor + device = rl_strategy.device + float_type = rl_strategy.float_type + act_dim = rl_strategy.act_dim + learning_mode = rl_strategy.learning_mode + perform_evaluation = rl_strategy.perform_evaluation + action_noise = rl_strategy.action_noise + collect_initial_experience_mode = rl_strategy.collect_initial_experience_mode + + # distinction whether we are in learning mode or not to handle exploration realised with noise + if learning_mode and not perform_evaluation: + # if we are in learning mode the first x episodes we want to explore the entire action space + # to get a good initial experience, in the area around the costs of the agent + if collect_initial_experience_mode: + # define current action as solely noise + noise = ( + th.normal(mean=0.0, std=0.2, size=(1, act_dim), dtype=float_type) + .to(device) + .squeeze() + ) + + # ============================================================================= + # 2.1 Get Actions and handle exploration + # ============================================================================= + base_bid = next_observation[-1] + + # add noise to the last dimension of the observation + # needs to be adjusted if observation space is changed, because only makes sense + # if the last dimension of the observation space are the marginal cost + curr_action = noise + base_bid.clone().detach() + + else: + # if we are not in the initial exploration phase we choose the action with the actor neural net + # and add noise to the action + curr_action = actor(next_observation).detach() # calls the forward method of the actor network + noise = th.tensor( + action_noise.noise(), device=device, dtype=float_type + ) + curr_action += noise + else: + # if we are not in learning mode we just use the actor neural net to get the action without adding noise + curr_action = actor(next_observation).detach() + noise = tuple(0 for _ in range(act_dim)) + + # Clamp actions to be within the valid action space bounds + curr_action = curr_action.clamp(-1, 1) + + return curr_action, noise + diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index e2dbfe388..9de249993 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -11,7 +11,7 @@ from assume.common.base import LearningStrategy from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm -from assume.reinforcement_learning.neural_network_architecture import ActorPPO, CriticPPO +from assume.reinforcement_learning.neural_network_architecture import CriticPPO logger = logging.getLogger(__name__) @@ -34,14 +34,14 @@ def __init__( self, learning_role, learning_rate=1e-4, - batch_size=1024, - gamma=0.99, - epochs=10, # PPO specific - clip_ratio=0.2, # PPO specific - vf_coef=0.5, # PPO specific - entropy_coef=0.01, # PPO specific - max_grad_norm=0.5, # PPO specific - gae_lambda=0.95, # PPO specific + gamma=0.99, # Discount factor for future rewards + epochs=10, # Number of epochs for updating the policy + clip_ratio=0.2, # Clipping parameter for policy updates + vf_coef=0.5, # Value function coefficient in the loss function + entropy_coef=0.02, # Entropy coefficient for exploration + max_grad_norm=0.5, # Gradient clipping value + gae_lambda=0.95, # GAE lambda for advantage estimation + batch_size=5, # Batch size for each update, if mini-batch approach is used (currently not implemented) actor_architecture="mlp", ): super().__init__( @@ -57,6 +57,7 @@ def __init__( 
self.entropy_coef = entropy_coef self.max_grad_norm = max_grad_norm self.gae_lambda = gae_lambda + self.n_updates = 0 # Number of updates performed # Unchanged method from MATD3 @@ -255,10 +256,12 @@ def create_actors(self) -> None: act_dim_list = [] for _, unit_strategy in self.learning_role.rl_strats.items(): - unit_strategy.actor = ActorPPO( + unit_strategy.actor = self.actor_architecture_class( obs_dim=unit_strategy.obs_dim, act_dim=unit_strategy.act_dim, float_type=self.float_type, + unique_obs_dim=unit_strategy.unique_obs_dim, + num_timeseries_obs_dim=unit_strategy.num_timeseries_obs_dim, ).to(self.device) # unit_strategy.actor_target = Actor( @@ -292,13 +295,13 @@ def create_actors(self) -> None: # Changed initialization of CriticPPO compared to MATD3 def create_critics(self) -> None: """ - Create critic networks for reinforcement learning. + Create decentralized critic networks for reinforcement learning. - This method initializes critic networks for each agent in the reinforcement learning setup. + This method initializes a separate critic network for each agent in the reinforcement learning setup. + Each critic learns to predict the value function based on the individual agent's observation. Notes: - The observation dimension need to be the same, due to the centralized critic that all actors share. - If you have units with different observation dimensions. They need to have different critics and hence learning roles. + Each agent has its own critic, so the critic is no longer shared among all agents. """ n_agents = len(self.learning_role.rl_strats) strategy: LearningStrategy @@ -307,9 +310,7 @@ def create_critics(self) -> None: for u_id, strategy in self.learning_role.rl_strats.items(): self.learning_role.critics[u_id] = CriticPPO( - n_agents=n_agents, obs_dim=strategy.obs_dim, - unique_obs_dim=strategy.unique_obs_dim, float_type=self.float_type, ) @@ -317,17 +318,17 @@ def create_critics(self) -> None: self.learning_role.critics[u_id].parameters(), lr=self.learning_rate ) - self.learning_role.target_critics[u_id].load_state_dict( - self.learning_role.critics[u_id].state_dict() - ) - self.learning_role.target_critics[u_id].train(mode=False) + # self.learning_role.target_critics[u_id].load_state_dict( + # self.learning_role.critics[u_id].state_dict() + # ) + # self.learning_role.target_critics[u_id].train(mode=False) self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( self.device ) - self.learning_role.target_critics[u_id] = self.learning_role.target_critics[ - u_id - ].to(self.device) + # self.learning_role.target_critics[u_id] = self.learning_role.target_critics[ + # u_id + # ].to(self.device) unique_obs_dim_list.append(strategy.unique_obs_dim) @@ -356,13 +357,13 @@ def extract_policy(self) -> dict: for u_id, unit_strategy in self.learning_role.rl_strats.items(): actors[u_id] = unit_strategy.actor - actor_targets[u_id] = unit_strategy.actor_target + # actor_targets[u_id] = unit_strategy.actor_target actors_and_critics = { "actors": actors, # "actor_targets": actor_targets, "critics": self.learning_role.critics, - "target_critics": self.learning_role.target_critics, + # "target_critics": self.learning_role.target_critics, "obs_dim": self.obs_dim, "act_dim": self.act_dim, "unique_obs_dim": self.unique_obs_dim, @@ -374,274 +375,140 @@ def update_policy(self): """ Perform policy updates using PPO with the clipped objective. 
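+
+        The update maximizes the standard clipped surrogate objective
+
+            L_clip = E_t[ min( r_t * A_t, clip(r_t, 1 - clip_ratio, 1 + clip_ratio) * A_t ) ],
+            r_t    = exp( log pi_new(a_t | s_t) - log pi_old(a_t | s_t) ),
+
+        with advantages A_t estimated via Generalized Advantage Estimation,
+
+            delta_t = reward_t + gamma * V(s_{t+1}) - V(s_t),
+            A_t     = delta_t + gamma * gae_lambda * A_{t+1},
+
+        and minimizes the total loss
+
+            policy_loss + vf_coef * value_loss - entropy_coef * entropy,
+
+        once per epoch for each agent's actor and decentralized critic.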
""" + + logger.debug("Updating Policy") + # We will iterate for multiple epochs to update both the policy (actor) and value (critic) networks + # The number of epochs controls how many times we update using the same collected data (from the buffer). + n_rl_agents = len(self.learning_role.rl_strats.keys()) + for _ in range(self.epochs): + self.n_updates += 1 + i = 0 + + # Iterate through over each agent's strategy + # Each agent has its own actor and critic. Critic (value network) is in comparison to MATD3 decentralized, meaning each agent learns its own value function. + for u_id in self.learning_role.rl_strats.keys(): + critic = self.learning_role.critics[u_id] + actor = self.learning_role.rl_strats[u_id].actor + + # Retrieve experiences from the buffer + # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. + transitions = self.learning_role.buffer.get() + states = transitions.observations + actions = transitions.actions + rewards = transitions.rewards + log_probs = transitions.log_probs - for epoch in range(self.epochs): - # Sample a batch from the replay buffer - transitions = self.learning_role.buffer.sample(self.batch_size) - states, actions, log_probs_old, returns, advantages = ( - transitions.observations, - transitions.actions, - transitions.log_probs, - transitions.returns, - transitions.advantages, - ) - # Update the policy (actor) - log_probs_new, entropy = self.learning_role.actor.evaluate_actions(states, actions) - - # Calculate the ratio of new policy probability to old policy probability - # This represents how much the new policy has changed compared to the old policy - ratio = (log_probs_new - log_probs_old).exp() - - # Compute the surrogate loss without clipping - # This is the raw loss term based on the advantage function - surrogate1 = ratio * advantages - - # Apply the clipping function to the ratio to prevent large policy updates - # The clipping function limits the ratio to be within the range [1 - clip_ratio, 1 + clip_ratio] - # This prevents the policy from deviating too much from the old policy - surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages - - # Calculate the final policy loss by taking the minimum of the unclipped and clipped surrogate losses - # The idea is to prevent large changes in policy and ensure stability during training - # The final policy loss is the negative mean of this minimum value - policy_loss = -th.min(surrogate1, surrogate2).mean() - - surrogate1 = ratio * advantages - surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages - policy_loss = -th.min(surrogate1, surrogate2).mean() - - # Update the critic (value function) - values = self.learning_role.critic(states).squeeze() - value_loss = F.mse_loss(returns, values) - - # Total loss - loss = policy_loss + self.vf_coef * value_loss - self.entropy_coef * entropy.mean() - - # Optimize actor and critic - self.learning_role.actor.optimizer.zero_grad() - self.learning_role.critic.optimizer.zero_grad() - loss.backward() - - # Clip gradients - th.nn.utils.clip_grad_norm_(self.learning_role.actor.parameters(), self.max_grad_norm) - th.nn.utils.clip_grad_norm_(self.learning_role.critic.parameters(), self.max_grad_norm) - - self.learning_role.actor.optimizer.step() - self.learning_role.critic.optimizer.step() - - - # def update_policy(self): - # """ - # Update the policy of the reinforcement learning agent using the Twin Delayed Deep Deterministic Policy Gradients (TD3) algorithm. 
- - # Notes: - # This function performs the policy update step, which involves updating the actor (policy) and critic (Q-function) networks - # using TD3 algorithm. It iterates over the specified number of gradient steps and performs the following steps for each - # learning strategy: - - # 1. Sample a batch of transitions from the replay buffer. - # 2. Calculate the next actions with added noise using the actor target network. - # 3. Compute the target Q-values based on the next states, rewards, and the target critic network. - # 4. Compute the critic loss as the mean squared error between current Q-values and target Q-values. - # 5. Optimize the critic network by performing a gradient descent step. - # 6. Optionally, update the actor network if the specified policy delay is reached. - # 7. Apply Polyak averaging to update target networks. - - # This function implements the TD3 algorithm's key step for policy improvement and exploration. - # """ - - # logger.debug("Updating Policy") - # n_rl_agents = len(self.learning_role.rl_strats.keys()) - # for _ in range(self.gradient_steps): - # self.n_updates += 1 - # i = 0 - - # for u_id in self.learning_role.rl_strats.keys(): - # critic_target = self.learning_role.target_critics[u_id] - # critic = self.learning_role.critics[u_id] - # actor = self.learning_role.rl_strats[u_id].actor - # actor_target = self.learning_role.rl_strats[u_id].actor_target - - # if i % 100 == 0: - # transitions = self.learning_role.buffer.sample(self.batch_size) - # states = transitions.observations - # actions = transitions.actions - # next_states = transitions.next_observations - # rewards = transitions.rewards - - # with th.no_grad(): - # # Select action according to policy and add clipped noise - # noise = actions.clone().data.normal_( - # 0, self.target_policy_noise - # ) - # noise = noise.clamp( - # -self.target_noise_clip, self.target_noise_clip - # ) - # next_actions = [ - # (actor_target(next_states[:, i, :]) + noise[:, i, :]).clamp( - # -1, 1 - # ) - # for i in range(n_rl_agents) - # ] - # next_actions = th.stack(next_actions) - - # next_actions = next_actions.transpose(0, 1).contiguous() - # next_actions = next_actions.view(-1, n_rl_agents * self.act_dim) - - # all_actions = actions.view(self.batch_size, -1) - - # # this takes the unique observations from all other agents assuming that - # # the unique observations are at the end of the observation vector - # temp = th.cat( - # ( - # states[:, :i, self.obs_dim - self.unique_obs_dim :].reshape( - # self.batch_size, -1 - # ), - # states[ - # :, i + 1 :, self.obs_dim - self.unique_obs_dim : - # ].reshape(self.batch_size, -1), - # ), - # axis=1, - # ) - - # # the final all_states vector now contains the current agent's observation - # # and the unique observations from all other agents - # all_states = th.cat( - # (states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 - # ).view(self.batch_size, -1) - # # all_states = states[:, i, :].reshape(self.batch_size, -1) - - # # this is the same as above but for the next states - # temp = th.cat( - # ( - # next_states[ - # :, :i, self.obs_dim - self.unique_obs_dim : - # ].reshape(self.batch_size, -1), - # next_states[ - # :, i + 1 :, self.obs_dim - self.unique_obs_dim : - # ].reshape(self.batch_size, -1), - # ), - # axis=1, - # ) - - # # the final all_next_states vector now contains the current agent's observation - # # and the unique observations from all other agents - # all_next_states = th.cat( - # (next_states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 
- # ).view(self.batch_size, -1) - # # all_next_states = next_states[:, i, :].reshape(self.batch_size, -1) - - # with th.no_grad(): - # # Compute the next Q-values: min over all critics targets - # next_q_values = th.cat( - # critic_target(all_next_states, next_actions), dim=1 - # ) - # next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True) - # target_Q_values = ( - # rewards[:, i].unsqueeze(1) + self.gamma * next_q_values - # ) - - # # Get current Q-values estimates for each critic network - # current_Q_values = critic(all_states, all_actions) - - # # Compute critic loss - # critic_loss = sum( - # F.mse_loss(current_q, target_Q_values) - # for current_q in current_Q_values - # ) - - # # Optimize the critics - # critic.optimizer.zero_grad() - # critic_loss.backward() - # critic.optimizer.step() - - # # Delayed policy updates - # if self.n_updates % self.policy_delay == 0: - # # Compute actor loss - # state_i = states[:, i, :] - # action_i = actor(state_i) - - # all_actions_clone = actions.clone() - # all_actions_clone[:, i, :] = action_i - # all_actions_clone = all_actions_clone.view(self.batch_size, -1) - - # actor_loss = -critic.q1_forward( - # all_states, all_actions_clone - # ).mean() - - # actor.optimizer.zero_grad() - # actor_loss.backward() - # actor.optimizer.step() - - # polyak_update( - # critic.parameters(), critic_target.parameters(), self.tau - # ) - # polyak_update( - # actor.parameters(), actor_target.parameters(), self.tau - # ) - - # i += 1 - - - -# def save_params(self, directory): -# """ Save the parameters of the actor and critic networks """ -# self.save_actor_params(directory=f"{directory}/actors") -# self.save_critic_params(directory=f"{directory}/critics") - -# def save_actor_params(self, directory): -# """ Save actor parameters. """ -# os.makedirs(directory, exist_ok=True) -# for u_id in self.learning_role.rl_strats.keys(): -# obj = { -# "actor": self.learning_role.rl_strats[u_id].actor.state_dict(), -# "actor_optimizer": self.learning_role.rl_strats[u_id].actor.optimizer.state_dict(), -# } -# path = f"{directory}/actor_{u_id}.pt" -# th.save(obj, path) - -# def save_critic_params(self, directory): -# """ Save critic parameters. """ -# os.makedirs(directory, exist_ok=True) -# for u_id in self.learning_role.rl_strats.keys(): -# obj = { -# "critic": self.learning_role.critics[u_id].state_dict(), -# "critic_optimizer": self.learning_role.critics[u_id].optimizer.state_dict(), -# } -# path = f"{directory}/critic_{u_id}.pt" -# th.save(obj, path) - -# def load_params(self, directory: str) -> None: -# """ Load actor and critic parameters """ -# self.load_actor_params(directory) -# self.load_critic_params(directory) - -# def load_actor_params(self, directory: str) -> None: -# """ Load actor parameters from a directory """ -# if not os.path.exists(directory): -# logger.warning("Actor directory does not exist! Initializing randomly.") -# return - -# for u_id in self.learning_role.rl_strats.keys(): -# try: -# actor_params = self.load_obj(f"{directory}/actors/actor_{str(u_id)}.pt") -# self.learning_role.rl_strats[u_id].actor.load_state_dict(actor_params["actor"]) -# self.learning_role.rl_strats[u_id].actor.optimizer.load_state_dict(actor_params["actor_optimizer"]) -# except Exception: -# logger.warning(f"No actor values loaded for agent {u_id}") - -# def load_critic_params(self, directory: str) -> None: -# """ Load critic parameters from a directory """ -# if not os.path.exists(directory): -# logger.warning("Critic directory does not exist! 
Initializing randomly.") -# return - -# for u_id in self.learning_role.rl_strats.keys(): -# try: -# critic_params = self.load_obj(f"{directory}/critics/critic_{str(u_id)}.pt") -# self.learning_role.critics[u_id].load_state_dict(critic_params["critic"]) -# self.learning_role.critics[u_id].optimizer.load_state_dict(critic_params["critic_optimizer"]) -# except Exception: -# logger.warning(f"No critic values loaded for agent {u_id}") + + + # STARTING FROM HERE, THE IMPLEMENTATION NEEDS TO BE FIXED + # Potentially, it could be useful to source some functionality out into methods stored in buffer.py + + + + # Pass the current states through the critic network to get value estimates. + values = critic(states) + + # Store the calculated values in the rollout buffer + # These values are used later to calculate the advantage estimates (for policy updates). + self.learning_role.buffer.values = values.detach().cpu().numpy() + + print("Buffer values") + print(self.learning_role.buffer.values) + + # Compute advantages using Generalized Advantage Estimation (GAE) + advantages = [] + last_advantage = 0 + returns = [] + for t in reversed(range(len(rewards))): + if t == len(rewards) - 1: + next_value = 0 + else: + next_value = values[t + 1] + + # Temporal difference delta + delta = rewards[t] + self.gamma * next_value - values[t] # Use self.gamma for discount factor + + # GAE advantage + last_advantage = delta + self.gamma * self.gae_lambda * last_advantage # Use self.gae_lambda for advantage estimation + advantages.insert(0, last_advantage) + returns.insert(0, last_advantage + values[t]) + + # Convert advantages and returns to tensors + advantages = th.tensor(advantages, dtype=th.float32, device=self.device) + returns = th.tensor(returns, dtype=th.float32, device=self.device) + + # Evaluate the new log-probabilities and entropy under the current policy + action_means = actor(states) + action_stddev = th.ones_like(action_means) # Assuming fixed standard deviation for simplicity + dist = th.distributions.Normal(action_means, action_stddev) + new_log_probs = dist.log_prob(actions).sum(-1) + entropy = dist.entropy().sum(-1) + + # Compute the ratio of new policy to old policy + ratio = (new_log_probs - log_probs).exp() + + # Surrogate loss calculation + surrogate1 = ratio * advantages + surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages # Use self.clip_ratio + + # Final policy loss (clipped surrogate loss) + policy_loss = -th.min(surrogate1, surrogate2).mean() + + # Value loss (mean squared error between the predicted values and returns) + value_loss = F.mse_loss(returns, values) + + # Total loss: policy loss + value loss - entropy bonus + total_loss = policy_loss + self.vf_coef * value_loss - self.entropy_coef * entropy.mean() # Use self.vf_coef and self.entropy_coef + + # Zero the gradients and perform backpropagation for both actor and critic + actor.optimizer.zero_grad() + critic.optimizer.zero_grad() + total_loss.backward() + + # Clip gradients to prevent gradient explosion + th.nn.utils.clip_grad_norm_(actor.parameters(), self.max_grad_norm) # Use self.max_grad_norm + th.nn.utils.clip_grad_norm_(critic.parameters(), self.max_grad_norm) # Use self.max_grad_norm + + # Perform optimization steps + actor.optimizer.step() + critic.optimizer.step() + + +def get_actions(rl_strategy, next_observation): + """ + Gets actions for a unit based on the observation using PPO. + + Args: + rl_strategy (RLStrategy): The strategy containing relevant information. 
+ next_observation (torch.Tensor): The observation. + + Returns: + torch.Tensor: The sampled actions. + torch.Tensor: The log probability of the sampled actions. + """ + + actor = rl_strategy.actor + device = rl_strategy.device + + # Pass observation through the actor network to get action logits (mean of action distribution) + action_logits = actor(next_observation.to(device)) + + # Create a normal distribution for continuous actions (with assumed standard deviation of 1.0) + action_distribution = th.distributions.Normal(action_logits, 1.0) + + # Sample an action from the distribution + sampled_action = action_distribution.sample() + + # Get the log probability of the sampled action (for later PPO loss calculation) + log_prob_action = action_distribution.log_prob(sampled_action).sum(dim=-1) + + # Detach the log probability tensor to stop gradient tracking (since you only need the value for later) + log_prob_action = log_prob_action.detach() + + # Bound actions to the valid action space range + sampled_action = sampled_action.clamp(-1, 1) + + return sampled_action, log_prob_action + diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index d7184a285..b099d61fe 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -7,6 +7,7 @@ import numpy as np import torch as th +import datetime try: # Check memory used by replay buffer when possible @@ -141,6 +142,8 @@ def add( self.rewards[self.pos : self.pos + len_obs] = reward.copy() self.pos += len_obs + + # Circular buffer if self.pos + len_obs >= self.buffer_size: self.full = True self.pos = 0 @@ -173,111 +176,261 @@ def sample(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) +class RolloutBufferTransitions(NamedTuple): + """ + A named tuple that represents the data stored in a rollout buffer for PPO. + + Attributes: + observations (torch.Tensor): The observations of the agents. + actions (torch.Tensor): The actions taken by the agents. + log_probs (torch.Tensor): The log probabilities of the actions taken. + advantages (torch.Tensor): The advantages calculated using GAE. + returns (torch.Tensor): The returns (discounted rewards) calculated. + """ + observations: th.Tensor + actions: th.Tensor + rewards: th.Tensor + log_probs: th.Tensor class RolloutBuffer: - def __init__(self, buffer_size, obs_dim, act_dim, n_agents, gamma=0.99, gae_lambda=0.95, device="cpu"): + def __init__( + self, + obs_dim: int, + act_dim: int, + n_rl_units: int, + device: str, + float_type, + initial_size: int = 0, + ): """ - A class for storing rollout data for PPO in a multi-agent setting. - Stores the trajectories (observations, actions, rewards, log_probs) for all agents. - + A class that represents a rollout buffer for storing observations, actions, and rewards. + The buffer starts empty and is dynamically expanded when needed. + Args: - buffer_size (int): Max size of the buffer (in terms of time steps). - obs_dim (int): Dimension of the observation space. - act_dim (int): Dimension of the action space. - n_agents (int): Number of agents. - gamma (float): Discount factor for rewards. - gae_lambda (float): Lambda parameter for Generalized Advantage Estimation (GAE). - device (str): Device to store the data ('cpu' or 'cuda'). + obs_dim (int): The dimension of the observation space. + act_dim (int): The dimension of the action space. + n_rl_units (int): The number of reinforcement learning units. 
+ device (str): The device to use for storing the data (e.g., 'cpu' or 'cuda'). + float_type (torch.dtype): The data type to use for the stored data. + initial_size (int): The initial size of the buffer (default is 0). """ - self.buffer_size = buffer_size + self.obs_dim = obs_dim self.act_dim = act_dim - self.n_agents = n_agents + self.n_rl_units = n_rl_units self.device = device - self.gamma = gamma - self.gae_lambda = gae_lambda - - # Initialize buffers - self.observations = np.zeros((buffer_size, n_agents, obs_dim), dtype=np.float32) - self.actions = np.zeros((buffer_size, n_agents, act_dim), dtype=np.float32) - self.rewards = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.log_probs = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.values = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.advantages = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.returns = np.zeros((buffer_size, n_agents), dtype=np.float32) - self.masks = np.ones((buffer_size, n_agents), dtype=np.float32) # Mask to indicate episode boundaries (1 for ongoing episode, 0 if episode ended) + + # Start with no buffer (None), will be created dynamically when first data is added + self.observations = None # Stores the agent's observations (states) at each timestep + self.actions = None # Stores the actions taken by the agent + self.rewards = None # Stores the rewards received after each action + self.log_probs = None # Stores the log-probabilities of the actions, used to compute the ratio for policy update + + self.values = None # Stores the value estimates (critic's predictions) of each state + self.advantages = None # Stores the computed advantages using GAE (Generalized Advantage Estimation), central to PPO's policy updates + self.returns = None # Stores the discounted rewards (also known as returns), used to compute the value loss for training the critic self.pos = 0 + self.full = False + + # Datatypes for numpy and PyTorch + self.np_float_type = np.float16 if float_type == th.float16 else np.float32 + self.th_float_type = float_type + + def initialize_buffer(self, size): + """Initializes the buffer with the given size.""" + self.observations = np.zeros( + (size, self.n_rl_units, self.obs_dim), dtype=self.np_float_type + ) + self.actions = np.zeros( + (size, self.n_rl_units, self.act_dim), dtype=self.np_float_type + ) + self.rewards = np.zeros( + (size, self.n_rl_units), dtype=self.np_float_type + ) + self.log_probs = np.zeros( + (size, self.n_rl_units), dtype=np.float32 + ) + self.values = np.zeros( + (size, self.n_rl_units), dtype=np.float32 + ) + self.advantages = np.zeros( + (size, self.n_rl_units), dtype=np.float32 + ) + self.returns = np.zeros( + (size, self.n_rl_units), dtype=np.float32 + ) + +def expand_buffer(self, additional_size): + """Expands the buffer by the given additional size and checks if there is enough memory available.""" + + # Calculation of the memory requirement for all 7 arrays + additional_memory_usage = ( + np.zeros((additional_size, self.n_rl_units, self.obs_dim), dtype=self.np_float_type).nbytes + + np.zeros((additional_size, self.n_rl_units, self.act_dim), dtype=self.np_float_type).nbytes + + np.zeros((additional_size, self.n_rl_units), dtype=self.np_float_type).nbytes + # rewards + np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # log_probs + np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # values + np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # advantages + 
np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes # returns + ) + + # Check whether enough memory is available + if psutil is not None: + mem_available = psutil.virtual_memory().available + if additional_memory_usage > mem_available: + # Conversion to GB + additional_memory_usage_gb = additional_memory_usage / 1e9 + mem_available_gb = mem_available / 1e9 + warnings.warn( + f"Not enough memory to expand the RolloutBuffer: " + f"{additional_memory_usage_gb:.2f}GB required, but only {mem_available_gb:.2f}GB available." + ) + + self.observations = np.concatenate( + (self.observations, np.zeros((additional_size, self.n_rl_units, self.obs_dim), dtype=self.np_float_type)), + axis=0 + ) + self.actions = np.concatenate( + (self.actions, np.zeros((additional_size, self.n_rl_units, self.act_dim), dtype=self.np_float_type)), + axis=0 + ) + self.rewards = np.concatenate( + (self.rewards, np.zeros((additional_size, self.n_rl_units), dtype=self.np_float_type)), + axis=0 + ) + self.log_probs = np.concatenate( + (self.log_probs, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), + axis=0 + ) + self.values = np.concatenate( + (self.values, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), + axis=0 + ) + self.advantages = np.concatenate( + (self.advantages, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), + axis=0 + ) + self.returns = np.concatenate( + (self.returns, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), + axis=0 + ) - def add(self, obs, actions, rewards, log_probs, values, dones): + def add( + self, + obs: np.array, + actions: np.array, + reward: np.array, + log_probs: np.array, + ): """ - Add data for the current time step to the buffer. + Adds an observation, action, reward, and log probabilities of all agents to the rollout buffer. + If the buffer does not exist, it will be initialized. If the buffer is full, it will be expanded. Args: - obs (np.array): The observations for all agents. - actions (np.array): The actions taken by all agents. - rewards (np.array): The rewards received by all agents. - log_probs (np.array): The log probabilities of the actions taken. - values (np.array): The value estimates for all agents. - dones (np.array): Whether the episode has finished for each agent. + obs (numpy.ndarray): The observation to add. + actions (numpy.ndarray): The actions to add. + reward (numpy.ndarray): The reward to add. + log_probs (numpy.ndarray): The log probabilities of the actions taken. 
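+
+            Note: obs and actions are expected with shape (time_steps, n_rl_units, dim) and
+            log_probs with shape (time_steps, n_rl_units, 1), matching the arrays assembled
+            in write_to_learning_role.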
""" - self.observations[self.pos] = obs - self.actions[self.pos] = actions - self.rewards[self.pos] = rewards - self.log_probs[self.pos] = log_probs - self.values[self.pos] = values - self.masks[self.pos] = 1.0 - dones + len_obs = obs.shape[0] + + if self.observations is None: + # Initialize buffer with initial size if it's the first add + self.initialize_buffer(len_obs) - self.pos += 1 + elif self.pos + len_obs > self.observations.shape[0]: + # If the buffer is full, expand it + self.expand_buffer(len_obs) - def compute_returns_and_advantages(self, last_values, dones): + # Add data to the buffer + self.observations[self.pos : self.pos + len_obs] = obs.copy() + self.actions[self.pos : self.pos + len_obs] = actions.copy() + self.rewards[self.pos : self.pos + len_obs] = reward.copy() + self.log_probs[self.pos : self.pos + len_obs] = log_probs.squeeze(-1).copy() + + self.pos += len_obs + + # print("buffer.add() in buffer.py") + # print(self.observations) + # print(self.actions) + # print(self.rewards) + # print(self.log_probs) + + def reset(self): + """Resets the buffer, clearing all stored data.""" + self.observations = None + self.actions = None + self.rewards = None + self.log_probs = None + self.pos = 0 + self.full = False + + # def compute_returns_and_advantages(self, last_values, dones): + # """ + # Compute the returns and advantages using Generalized Advantage Estimation (GAE). + + # Args: + # last_values (np.array): Value estimates for the last observation. + # dones (np.array): Whether the episode has finished for each agent. + # """ + # # Initialize the last advantage to 0. This will accumulate as we move backwards in time. + # last_advantage = 0 + + # # Loop backward through all the steps in the buffer to calculate returns and advantages. + # # This is because GAE (Generalized Advantage Estimation) relies on future rewards, + # # so we compute it from the last step back to the first step. + # for step in reversed(range(self.pos)): + + # # If we are at the last step in the buffer + # if step == self.pos - 1: + # # If it's the last step, check whether the episode has finished using `dones`. + # # `next_non_terminal` is 0 if the episode has ended, 1 if it's ongoing. + # next_non_terminal = 1.0 - dones + # # Use the provided last values (value estimates for the final observation in the episode) + # next_values = last_values + # else: + # # For other steps, use the mask to determine if the episode is ongoing. + # # If `masks[step + 1]` is 1, the episode is ongoing; if it's 0, the episode has ended. + # next_non_terminal = self.masks[step + 1] + # # Use the value of the next time step to compute the future returns + # next_values = self.values[step + 1] + + # # Temporal difference (TD) error, also known as delta: + # # This is the difference between the reward obtained at this step and the estimated value of this step + # # plus the discounted value of the next step (if the episode is ongoing). + # # This measures how "off" the value function is at predicting the future return. + # delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step] + + # # Compute the advantage for this step using GAE: + # # `delta` is the immediate advantage, and we add to it the discounted future advantage, + # # scaled by the factor `lambda` (from GAE). This allows for a more smooth approximation of advantage. + # # `next_non_terminal` ensures that if the episode has ended, the future advantage stops accumulating. 
+ # self.advantages[step] = last_advantage = delta + self.gamma * self.gae_lambda * next_non_terminal * last_advantage + + # # The return is the advantage plus the baseline value estimate. + # # This makes sure that the return includes both the immediate rewards and the learned value of future rewards. + # self.returns[step] = self.advantages[step] + self.values[step] + + def to_torch(self, array: np.array, copy=True): """ - Compute the returns and advantages using Generalized Advantage Estimation (GAE). + Converts a numpy array to a PyTorch tensor. Note: It copies the data by default. Args: - last_values (np.array): Value estimates for the last observation. - dones (np.array): Whether the episode has finished for each agent. + array (numpy.ndarray): The numpy array to convert. + copy (bool, optional): Whether to copy or not the data + (may be useful to avoid changing things by reference). Defaults to True. + + Returns: + torch.Tensor: The converted PyTorch tensor. """ - # Initialize the last advantage to 0. This will accumulate as we move backwards in time. - last_advantage = 0 - - # Loop backward through all the steps in the buffer to calculate returns and advantages. - # This is because GAE (Generalized Advantage Estimation) relies on future rewards, - # so we compute it from the last step back to the first step. - for step in reversed(range(self.pos)): - - # If we are at the last step in the buffer - if step == self.pos - 1: - # If it's the last step, check whether the episode has finished using `dones`. - # `next_non_terminal` is 0 if the episode has ended, 1 if it's ongoing. - next_non_terminal = 1.0 - dones - # Use the provided last values (value estimates for the final observation in the episode) - next_values = last_values - else: - # For other steps, use the mask to determine if the episode is ongoing. - # If `masks[step + 1]` is 1, the episode is ongoing; if it's 0, the episode has ended. - next_non_terminal = self.masks[step + 1] - # Use the value of the next time step to compute the future returns - next_values = self.values[step + 1] - - # Temporal difference (TD) error, also known as delta: - # This is the difference between the reward obtained at this step and the estimated value of this step - # plus the discounted value of the next step (if the episode is ongoing). - # This measures how "off" the value function is at predicting the future return. - delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step] - - # Compute the advantage for this step using GAE: - # `delta` is the immediate advantage, and we add to it the discounted future advantage, - # scaled by the factor `lambda` (from GAE). This allows for a more smooth approximation of advantage. - # `next_non_terminal` ensures that if the episode has ended, the future advantage stops accumulating. - self.advantages[step] = last_advantage = delta + self.gamma * self.gae_lambda * next_non_terminal * last_advantage - - # The return is the advantage plus the baseline value estimate. - # This makes sure that the return includes both the immediate rewards and the learned value of future rewards. - self.returns[step] = self.advantages[step] + self.values[step] - - - def get(self): + + if copy: + return th.tensor(array, dtype=self.th_float_type, device=self.device) + + return th.as_tensor(array, dtype=self.th_float_type, device=self.device) + + def get(self) -> RolloutBufferTransitions: """ Get all data stored in the buffer and convert it to PyTorch tensors. 
Returns the observations, actions, log_probs, advantages, returns, and masks. @@ -285,13 +438,15 @@ def get(self): data = ( self.observations[:self.pos], self.actions[:self.pos], - self.log_probs[:self.pos], - self.advantages[:self.pos], - self.returns[:self.pos], - self.masks[:self.pos], + self.rewards[:self.pos], + self.log_probs[:self.pos] + # self.masks[:self.pos], ) - return tuple(map(lambda x: th.tensor(x, device=self.device), data)) + + return RolloutBufferTransitions(*tuple(map(self.to_torch, data))) def reset(self): """Reset the buffer after each update.""" self.pos = 0 + + diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 96c8656f5..105d4cfd3 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -36,7 +36,7 @@ def __init__( learning_config: LearningConfig, ): # General parameters - self.rl_algorithm_name = learning_config["algorithm"] + self.rl_algorithm_name = learning_config.get("algorithm", "matd3") self.early_stopping_steps = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_steps", 10) self.early_stopping_threshold = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_threshold", 0.05) self.episodes_done = 0 @@ -55,6 +55,8 @@ def __init__( self.device = th.device(learning_config["device"] if th.cuda.is_available() else "cpu") + self.learning_rate = learning_config["learning_rate"] + # Algorithm-specific parameters if self.rl_algorithm_name == "matd3": self.buffer: ReplayBuffer = None @@ -64,13 +66,9 @@ def __init__( self.training_episodes = learning_config["matd3"]["training_episodes"] self.train_freq = learning_config["matd3"]["train_freq"] self.gradient_steps = int(self.train_freq[:-1]) if learning_config["matd3"].get("gradient_steps", -1) == -1 else learning_config["matd3"]["gradient_steps"] - # self.batch_size = learning_config["matd3"]["batch_size"] - # self.gamma = learning_config["matd3"]["gamma"] - self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get("batch_size", 128) self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) - self.learning_rate = learning_config["matd3"]["learning_rate"] self.noise_sigma = learning_config["matd3"]["noise_sigma"] self.noise_scale = learning_config["matd3"]["noise_scale"] self.episodes_collecting_initial_experience = max(learning_config.get(self.rl_algorithm_name, {}).get("episodes_collecting_initial_experience", 5), 1) @@ -79,21 +77,15 @@ def __init__( self.buffer: RolloutBuffer = None self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get("actor_architecture", "mlp") self.training_episodes = learning_config["ppo"]["training_episodes"] - self.steps_per_epoch = learning_config["ppo"]["steps_per_epoch"] + self.train_freq = learning_config["ppo"]["train_freq"] + # self.steps_per_epoch = learning_config["ppo"]["steps_per_epoch"] # self.batch_size = learning_config["matd3"]["batch_size"] # self.gamma = learning_config["matd3"]["gamma"] - - self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get("batch_size", 128) - self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) - - self.clip_ratio = learning_config["ppo"]["clip_ratio"] - self.entropy_coeff = learning_config["ppo"]["entropy_coeff"] - self.value_coeff = learning_config["ppo"]["value_coeff"] - self.device = th.device(learning_config["ppo"]["device"] if th.cuda.is_available() else "cpu") - self.learning_rate = 
learning_config["ppo"]["learning_rate"] - - - + # self.clip_ratio = learning_config["ppo"]["clip_ratio"] + # self.entropy_coeff = learning_config["ppo"]["entropy_coeff"] + # self.value_coeff = learning_config["ppo"]["value_coeff"] + # self.device = th.device(learning_config["ppo"]["device"] if th.cuda.is_available() else "cpu") + # self.learning_rate = learning_config["ppo"]["learning_rate"] # Set up CUDA and float types th.backends.cuda.matmul.allow_tf32 = True @@ -130,12 +122,13 @@ def load_inter_episodic_data(self, inter_episodic_data): self.avg_rewards = inter_episodic_data["avg_all_eval"] self.buffer = inter_episodic_data["buffer"] - # if enough initial experience was collected according to specifications in learning config - # turn off initial exploration and go into full learning mode - if self.episodes_done > self.episodes_collecting_initial_experience: - self.turn_off_initial_exploration() + if self.rl_algorithm_name == "matd3": + # if enough initial experience was collected according to specifications in learning config + # turn off initial exploration and go into full learning mode + if self.episodes_done > self.episodes_collecting_initial_experience: + self.turn_off_initial_exploration() - self.set_noise_scale(inter_episodic_data["noise_scale"]) + self.set_noise_scale(inter_episodic_data["noise_scale"]) self.initialize_policy(inter_episodic_data["actors_and_critics"]) @@ -186,15 +179,31 @@ def save_buffer_and_update(self, content: dict, meta: dict) -> None: meta (dict): The metadata associated with the message. (not needed yet) """ - if content.get("type") == "save_buffer_and_update": - data = content["data"] - self.buffer.add( - obs=data[0], - actions=data[1], - reward=data[2], - ) + if self.rl_algorithm_name == "matd3": + if content.get("type") == "save_buffer_and_update": + data = content["data"] + self.buffer.add( + obs=data[0], + actions=data[1], + reward=data[2], + ) - self.update_policy() + self.update_policy() + + elif self.rl_algorithm_name == "ppo": + # print("save_buffer_and_update in learning_role.py") + if content.get("type") == "save_buffer_and_update": + data = content["data"] + self.buffer.add( + obs=data[0], + actions=data[1], + reward=data[2], + log_probs=data[3], + # values=data[4], + # dones=data[5], + ) + + self.update_policy() # TD3 def turn_off_initial_exploration(self) -> None: @@ -246,16 +255,18 @@ def create_learning_algorithm(self, algorithm: str): ) elif algorithm == "ppo": self.rl_algorithm = PPO( - learning_role=self, - learning_rate=self.learning_rate, - steps_per_epoch=self.steps_per_epoch, - batch_size=self.batch_size, - gamma=self.gamma, - clip_ratio=self.clip_ratio, - entropy_coeff=self.entropy_coeff, - value_coeff=self.value_coeff, - actor_architecture=self.actor_architecture, - ) + learning_role=self, + learning_rate=self.learning_rate, + gamma=self.gamma, # Discount factor + epochs=self.epochs, # Number of epochs for policy updates + clip_ratio=self.clip_ratio, # PPO-specific clipping parameter + vf_coef=self.vf_coef, # Coefficient for value function loss + entropy_coef=self.entropy_coef, # Coefficient for entropy to encourage exploration + max_grad_norm=self.max_grad_norm, # Maximum gradient norm for clipping + gae_lambda=self.gae_lambda, # Lambda for Generalized Advantage Estimation (GAE) + batch_size=self.batch_size, # Batch size for mini-batch updates (optional) + actor_architecture=self.actor_architecture, # Actor network architecture + ) else: logger.error(f"Learning algorithm {algorithm} not implemented!") @@ -299,8 +310,12 @@ def 
update_policy(self) -> None: Notes: This method is typically scheduled to run periodically during training to continuously improve the agent's policy. """ - if self.episodes_done > self.episodes_collecting_initial_experience: - self.rl_algorithm.update_policy() + if self.rl_algorithm_name == "ppo": + self.rl_algorithm.update_policy() + else: + if self.episodes_done > self.episodes_collecting_initial_experience: + self.rl_algorithm.update_policy() + def compare_and_save_policies(self, metrics: dict) -> bool: """ diff --git a/assume/reinforcement_learning/learning_unit_operator.py b/assume/reinforcement_learning/learning_unit_operator.py index 3b39715e4..7a546c9c6 100644 --- a/assume/reinforcement_learning/learning_unit_operator.py +++ b/assume/reinforcement_learning/learning_unit_operator.py @@ -153,20 +153,28 @@ def write_learning_to_output(self, orderbook: Orderbook, market_id: str) -> None } ) + # Only for MATD3, not for PPO + # Check if exploration_noise is not empty (MATD3) action_tuple = unit.outputs["actions"].loc[start] - noise_tuple = unit.outputs["exploration_noise"].loc[start] + if "exploration_noise" in unit.outputs and hasattr(unit.outputs["exploration_noise"].loc[start], "numel"): + noise_tuple = unit.outputs["exploration_noise"].loc[start] + action_dim = action_tuple.numel() for i in range(action_dim): - output_dict[f"exploration_noise_{i}"] = ( - noise_tuple[i] if action_dim > 1 else noise_tuple - ) + # Only for MATD3, not for PPO + if "exploration_noise" in unit.outputs and hasattr(unit.outputs["exploration_noise"].loc[start], "numel"): + output_dict[f"exploration_noise_{i}"] = ( + noise_tuple[i] if action_dim > 1 else noise_tuple + ) + # For MATD3 and PPO output_dict[f"actions_{i}"] = ( action_tuple[i] if action_dim > 1 else action_tuple ) output_agent_list.append(output_dict) + db_aid = self.context.data.get("learning_output_agent_id") db_addr = self.context.data.get("learning_output_agent_addr") @@ -181,6 +189,7 @@ def write_learning_to_output(self, orderbook: Orderbook, market_id: str) -> None }, ) + # Executed in the interval set by train_frequency async def write_to_learning_role( self, ) -> None: @@ -188,6 +197,9 @@ async def write_to_learning_role( Writes learning results to the learning agent. 
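+
+        For PPO, the payload sent to the learning role additionally includes the log
+        probabilities of the sampled actions (all_log_probs); for MATD3 it contains only
+        observations, actions, and rewards.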
""" + + # print("write_to_learning_role in learning_unit_operator.py") + if len(self.rl_units) == 0: return @@ -196,11 +208,12 @@ async def write_to_learning_role( device = self.learning_strategies["device"] learning_unit_count = len(self.rl_units) + # How many reward values are available in the first learning unit -> equals the number of steps values_len = len(self.rl_units[0].outputs["rl_rewards"]) # return if no data is available if values_len == 0: return - + all_observations = th.zeros( (values_len, learning_unit_count, obs_dim), device=device ) @@ -209,17 +222,38 @@ async def write_to_learning_role( ) all_rewards = [] + # For PPO + # dimensions: steps, learning units, one log-prob for multiple observations/actions dimensions + all_log_probs = th.zeros( + (values_len, learning_unit_count, 1), device=device + ) + + # i is the index of the learning unit, unit is the learning unit object for i, unit in enumerate(self.rl_units): + # Convert pandas Series to torch Tensor obs_tensor = th.stack(unit.outputs["rl_observations"][:values_len], dim=0) + actions_tensor = th.stack( unit.outputs["rl_actions"][:values_len], dim=0 ).reshape(-1, act_dim) + # In the second dimension, the tensors include the number of the learning units + # Three dimensions: Steps, learning units, observation/action dimensions all_observations[:, i, :] = obs_tensor all_actions[:, i, :] = actions_tensor all_rewards.append(unit.outputs["rl_rewards"]) + # For PPO + # Check whether the list of tensors is not empty and whether the tensors contain elements + if unit.outputs["rl_log_probs"] and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): + + log_prob_tensor = th.stack( + unit.outputs["rl_log_probs"][:values_len], dim=0 + ).unsqueeze(-1) + + all_log_probs[:, i, :] = log_prob_tensor + # reset the outputs unit.reset_saved_rl_data() @@ -236,8 +270,30 @@ async def write_to_learning_role( .numpy() .reshape(-1, learning_unit_count, act_dim) ) + + all_rewards = np.array(all_rewards).reshape(-1, learning_unit_count) - rl_agent_data = (all_observations, all_actions, all_rewards) + + # For PPO + if unit.outputs["rl_log_probs"] and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): + all_log_probs = all_log_probs.detach().cpu().numpy().reshape(-1, learning_unit_count, 1) + + # print("ALL_OBSERVATIONS") + # print(all_observations) + + # print("ALL_ACTIONS") + # print(all_actions) + + # print("ALL_REWARDS") + # print(all_rewards) + + # print("ALL_LOG_PROBS") + # print(all_log_probs) + + rl_agent_data = (all_observations, all_actions, all_rewards, all_log_probs) + # For MATD3 + else: + rl_agent_data = (all_observations, all_actions, all_rewards) learning_role_id = self.context.data.get("learning_agent_id") learning_role_addr = self.context.data.get("learning_agent_addr") diff --git a/assume/reinforcement_learning/learning_utils.py b/assume/reinforcement_learning/learning_utils.py index d7d181958..2189c37bb 100644 --- a/assume/reinforcement_learning/learning_utils.py +++ b/assume/reinforcement_learning/learning_utils.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later +from datetime import timedelta +import pandas as pd from datetime import datetime from typing import TypedDict @@ -97,3 +99,51 @@ def polyak_update(params, target_params, tau: float): for param, target_param in zip(params, target_params): target_param.data.mul_(1 - tau) th.add(target_param.data, param.data, alpha=tau, out=target_param.data) + + +# # For non-dynamic PPO buffer size calculation (remove if buffer stays 
dynamic)
+# def convert_to_timedelta(time_str):
+#     # If it is already a Timedelta object, return it directly
+#     if isinstance(time_str, pd.Timedelta):
+#         return time_str
+
+#     # Extract the time value and the unit from the string
+#     time_value, time_unit = int(time_str[:-1]), time_str[-1]
+
+#     if time_unit == 'h':
+#         return timedelta(hours=time_value)
+#     elif time_unit == 'd':
+#         return timedelta(days=time_value)
+#     elif time_unit == 'm':
+#         return timedelta(minutes=time_value)
+#     else:
+#         raise ValueError(f"Unsupported time unit: {time_unit}")
+
+# # For non-dynamic PPO buffer size calculation (remove if buffer stays dynamic)
+# def calculate_total_timesteps_per_episode(start_date, end_date, time_step):
+#     # If start_date and end_date are already timestamps, use them directly
+#     if isinstance(start_date, str):
+#         start_dt = datetime.strptime(start_date, "%Y-%m-%d %H:%M")
+#     else:
+#         start_dt = start_date
+
+#     if isinstance(end_date, str):
+#         end_dt = datetime.strptime(end_date, "%Y-%m-%d %H:%M")
+#     else:
+#         end_dt = end_date
+
+#     # Calculate the total time span
+#     total_time = end_dt - start_dt
+
+#     # Convert time_step into a timedelta object if it is not already a Timedelta
+#     time_step_td = convert_to_timedelta(time_step)
+
+#     # Calculate the total number of time steps for the whole duration
+#     total_timesteps = total_time // time_step_td
+
+#     # print("Total timesteps:")
+#     # print(total_timesteps)
+
+#     return total_timesteps
+
+
diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py
index 3cbb76aed..26718a1dc 100644
--- a/assume/reinforcement_learning/neural_network_architecture.py
+++ b/assume/reinforcement_learning/neural_network_architecture.py
@@ -92,80 +92,36 @@ def q1_forward(self, obs, actions):
 class CriticPPO(nn.Module):
-    """Critic Network for Proximal Policy Optimization (PPO) in a Multi-Agent Setting.
+    """Critic Network for Proximal Policy Optimization (PPO).
 
-    Args:
-        n_agents (int): Number of agents
-        obs_dim (int): Dimension of each state
-        unique_obs_dim (int): Unique observation dimension per agent
-        float_type: Data type for the model parameters
-    """
-    # Actor dimension missing compared to MATD3 -> not needed for PPO
-    def __init__(
-        self,
-        n_agents: int,
-        obs_dim: int,
-        float_type,
-        unique_obs_dim: int,
-    ):
-
-        super(CriticPPO, self).__init__()
-
-        # Define the combined observation dimension
-        combined_obs_dim = obs_dim + unique_obs_dim * (n_agents - 1)
-
-        # Define the architecture of the Critic network
-        self.fc1 = nn.Linear(combined_obs_dim, 256, dtype=float_type)
-        self.fc2 = nn.Linear(256, 128, dtype=float_type)
-        self.fc3 = nn.Linear(128, 1, dtype=float_type)
-
-    def forward(self, x):
-        """Forward pass through the network."""
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        value = self.fc3(x)
-        return value
-
-class ActorPPO(nn.Module):
-    """
-    Actor network for PPO using MLP architecture with action sampling.
+    Each agent has its own critic, so this class defines the architecture for an individual agent's critic.
 
     Args:
         obs_dim (int): Dimension of the observation space.
-        act_dim (int): Dimension of the action space.
         float_type: Data type for the model parameters.
""" - def __init__(self, obs_dim: int, act_dim: int, float_type): + + def __init__(self, obs_dim: int, float_type): super().__init__() - # Define the actor network layers + + # Define the architecture of the Critic network for an individual agent self.fc1 = nn.Linear(obs_dim, 256, dtype=float_type) self.fc2 = nn.Linear(256, 128, dtype=float_type) - self.fc3 = nn.Linear(128, act_dim, dtype=float_type) + self.fc3 = nn.Linear(128, 1, dtype=float_type) def forward(self, obs): - """Forward pass to generate action logits.""" - x = F.relu(self.fc1(obs)) - x = F.relu(self.fc2(x)) - action_logits = self.fc3(x) # action_logits are mean values for continuous action space - return F.tanh(action_logits) # Bound action space between [-1, 1] - - def act(self, obs): - """ - Samples an action and returns both the action and its log probability. + """Forward pass through the critic network. Args: - obs (torch.Tensor): The observation input. + obs (torch.Tensor): The observation input for the agent. Returns: - action (torch.Tensor): The sampled action. - log_prob (torch.Tensor): Log probability of the action. + torch.Tensor: The value output from the critic network. """ - action_logits = self.forward(obs) - action_dist = th.distributions.Normal(action_logits, 1.0) # Assuming standard deviation of 1 for simplicity - action = action_dist.sample() # Choose a random action from the distribution - log_prob = action_dist.log_prob(action).sum(dim=-1) # Summing log probs across action dimensions - return action, log_prob - + x = F.relu(self.fc1(obs)) + x = F.relu(self.fc2(x)) + value = self.fc3(x) + return value class Actor(nn.Module): """ diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 941229ff7..9bf60b97c 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -26,6 +26,8 @@ from assume.strategies import BaseStrategy from assume.world import World +#from assume.reinforcement_learning.learning_utils import calculate_total_timesteps_per_episode + logger = logging.getLogger(__name__) @@ -429,6 +431,9 @@ def load_config_and_create_forecaster( start = pd.Timestamp(config["start_date"]) end = pd.Timestamp(config["end_date"]) + # New addition for PPO + time_step = pd.Timedelta(config["time_step"]) + index = pd.date_range( start=start, end=end + timedelta(days=1), @@ -513,6 +518,8 @@ def load_config_and_create_forecaster( "demand_units": demand_units, "industrial_dsm_units": industrial_dsm_units, "forecaster": forecaster, + # New addition for PPO + "time_step": time_step, } @@ -900,7 +907,8 @@ def run_learning( # Load scenario data scenario_data = load_config_and_create_forecaster(inputs_path, scenario, study_case) - print(world.learning_role.rl_algorithm_name) + # For PPO buffer size calculation + validation_interval_from_config = world.learning_config.get("validation_episodes_interval", 5) if world.learning_role.rl_algorithm_name == "matd3": buffer = ReplayBuffer( @@ -912,8 +920,12 @@ def run_learning( float_type=world.learning_role.float_type, ) elif world.learning_role.rl_algorithm_name == "ppo": + + # For non-dynamic buffer size: Calculate number of timesteps here for a full episode + # total_timesteps_per_episode = calculate_total_timesteps_per_episode(scenario_data['start'], scenario_data['end'], scenario_data['time_step']) + buffer = RolloutBuffer( - buffer_size=int(world.learning_config.get("rollout_buffer_size", 2048)), + # buffer_size=int(total_timesteps_per_episode * validation_interval_from_config), # For non-dynamic buffer size 
obs_dim=world.learning_role.rl_algorithm.obs_dim, act_dim=world.learning_role.rl_algorithm.act_dim, n_rl_units=len(world.learning_role.rl_strats), @@ -940,15 +952,16 @@ def run_learning( # external noise addition unnecessary. inter_episodic_data["noise_scale"] = world.learning_config.get("noise_scale", 1.0) - + # if world.learning_role.rl_algorithm_name == "matd3": # Sets the validation interval: After how many episodes does validation take place validation_interval = min( world.learning_role.training_episodes, - world.learning_config.get("validation_episodes_interval", 5), + validation_interval_from_config, ) eval_episode = 1 + # Training loop with integrated validation after a certain number of episodes for episode in tqdm( range(1, world.learning_role.training_episodes + 1), desc="Training Episodes", @@ -962,24 +975,22 @@ def run_learning( ) world.learning_role.load_inter_episodic_data(inter_episodic_data) - world.run() + world.run() # triggers calculate_bids() which equals to step inter_episodic_data = world.learning_role.get_inter_episodic_data() inter_episodic_data["episodes_done"] = episode - # Reset the PPO Rollout Buffer after each episode - if world.learning_role.rl_algorithm_name == "ppo": - inter_episodic_data["buffer"].reset() - - - - # Perform validation at regular intervals if ( episode % validation_interval == 0 - and episode >= world.learning_role.episodes_collecting_initial_experience + validation_interval + and ( + episode >= world.learning_role.episodes_collecting_initial_experience + validation_interval + if world.learning_role.rl_algorithm_name == "matd3" + else episode > validation_interval # For PPO + ) ): + world.reset() setup_world( @@ -993,20 +1004,31 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) world.run() - if world.learning_role.rl_algorithm_name == "ppo": - advantages, returns = compute_advantages_and_returns(world, inter_episodic_data) - world.learning_role.update_policy_with_ppo(advantages, returns) - surrogate_loss = compute_surrogate_loss(world, inter_episodic_data) - terminate = world.learning_role.compare_and_save_policies({"surrogate_loss": surrogate_loss}) - - # Reset the PPO Rollout Buffer after validation - inter_episodic_data["buffer"].reset() - - elif world.learning_role.rl_algorithm_name == "matd3": + if world.learning_role.rl_algorithm_name == "matd3": total_rewards = world.output_role.get_sum_reward() avg_reward = np.mean(total_rewards) terminate = world.learning_role.compare_and_save_policies({"avg_reward": avg_reward}) + + if world.learning_role.rl_algorithm_name == "ppo": + # PPO uses the surrogate loss to monitor policy updates. + # The surrogate loss quantifies how much the new policy has changed compared to the old one. + # If the surrogate loss becomes too small or too large, it can indicate issues: + # - A very small value may mean that the policy is near its optimum. + # - A large value could indicate excessive policy updates, leading to instability. + # + # It may be useful to terminate the training early based on the surrogate loss, + # especially if no significant improvement is expected, or if the model becomes unstable. + # + # In this example, the surrogate_loss could be computed, and then + # `compare_and_save_policies` can be used to check whether the training should be terminated. 
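+                #
+                # As a sketch (an assumption, not part of the current implementation), the last
+                # policy_loss computed in PPO.update_policy could be stored and passed here as
+                # surrogate_loss, or the reward-based criterion from the MATD3 branch above could
+                # be reused in the meantime, e.g.:
+                #   total_rewards = world.output_role.get_sum_reward()
+                #   avg_reward = np.mean(total_rewards)
+                #   terminate = world.learning_role.compare_and_save_policies({"avg_reward": avg_reward})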
+ + # surrogate_loss = + # terminate = world.learning_role.compare_and_save_policies({"surrogate_loss": surrogate_loss}) + + # Reset the PPO Rollout Buffer after each update + inter_episodic_data["buffer"].reset() + inter_episodic_data["eval_episodes_done"] = eval_episode if terminate: @@ -1014,6 +1036,10 @@ def run_learning( eval_episode += 1 + + + + world.reset() if episode == (world.learning_role.training_episodes): @@ -1037,179 +1063,7 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) - - -# def run_learning( -# world: World, -# inputs_path: str, -# scenario: str, -# study_case: str, -# verbose: bool = False, -# ) -> None: -# """ -# Train Deep Reinforcement Learning (DRL) agents to act in a simulated market environment. - -# This function runs multiple episodes of simulation to train DRL agents, performs evaluation, and saves the best runs. It maintains the buffer and learned agents in memory to avoid resetting them with each new run. - -# Args: -# world (World): An instance of the World class representing the simulation environment. -# inputs_path (str): The path to the folder containing input files necessary for the simulation. -# scenario (str): The name of the scenario for the simulation. -# study_case (str): The specific study case for the simulation. - -# Note: -# - The function uses a ReplayBuffer to store experiences for training the DRL agents. -# - It iterates through training episodes, updating the agents and evaluating their performance at regular intervals. -# - Initial exploration is active at the beginning and is disabled after a certain number of episodes to improve the performance of DRL algorithms. -# - Upon completion of training, the function performs an evaluation run using the best policy learned during training. -# - The best policies are chosen based on the average reward obtained during the evaluation runs, and they are saved for future use. -# """ -# from assume.reinforcement_learning.buffer import ReplayBuffer, RolloutBuffer - -# if not verbose: -# logger.setLevel(logging.WARNING) - -# # remove csv path so that nothing is written while learning -# temp_csv_path = world.export_csv_path -# world.export_csv_path = "" - -# # initialize policies already here to set the obs_dim and act_dim in the learning role -# actors_and_critics = None -# world.learning_role.initialize_policy(actors_and_critics=actors_and_critics) -# world.output_role.del_similar_runs() - -# # check if we already stored policies for this simualtion -# save_path = world.learning_config["trained_policies_save_path"] - -# if Path(save_path).is_dir(): -# # we are in learning mode and about to train new policies, which might overwrite existing ones -# accept = input( -# f"{save_path=} exists - should we overwrite current learnings? 
(y/N) " -# ) -# if not accept.lower().startswith("y"): -# # stop here - do not start learning or save anything -# raise AssumeException("don't overwrite existing strategies") - -# # ----------------------------------------- -# # Load scenario data to reuse across episodes -# scenario_data = load_config_and_create_forecaster(inputs_path, scenario, study_case) - -# # ----------------------------------------- -# # Information that needs to be stored across episodes, aka one simulation run -# inter_episodic_data = { -# "buffer": ReplayBuffer( -# buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), -# obs_dim=world.learning_role.rl_algorithm.obs_dim, -# act_dim=world.learning_role.rl_algorithm.act_dim, -# n_rl_units=len(world.learning_role.rl_strats), -# device=world.learning_role.device, -# float_type=world.learning_role.float_type, -# ), -# "actors_and_critics": None, -# "max_eval": defaultdict(lambda: -1e9), -# "all_eval": defaultdict(list), -# "avg_all_eval": [], -# "episodes_done": 0, -# "eval_episodes_done": 0, -# "noise_scale": world.learning_config.get("noise_scale", 1.0), -# } - -# # ----------------------------------------- - -# validation_interval = min( -# world.learning_role.training_episodes, -# world.learning_config.get("validation_episodes_interval", 5), -# ) - -# eval_episode = 1 - -# for episode in tqdm( -# range(1, world.learning_role.training_episodes + 1), -# desc="Training Episodes", -# ): -# # TODO normally, loading twice should not create issues, somehow a scheduling issue is raised currently -# if episode != 1: -# setup_world( -# world=world, -# scenario_data=scenario_data, -# study_case=study_case, -# episode=episode, -# ) - -# # ----------------------------------------- -# # Give the newly initliazed learning role the needed information across episodes -# world.learning_role.load_inter_episodic_data(inter_episodic_data) - -# world.run() - -# # ----------------------------------------- -# # Store updated information across episodes -# inter_episodic_data = world.learning_role.get_inter_episodic_data() -# inter_episodic_data["episodes_done"] = episode - -# # evaluation run: -# if ( -# episode % validation_interval == 0 -# and episode -# >= world.learning_role.episodes_collecting_initial_experience -# + validation_interval -# ): -# world.reset() - -# # load evaluation run -# setup_world( -# world=world, -# scenario_data=scenario_data, -# study_case=study_case, -# perform_evaluation=True, -# eval_episode=eval_episode, -# ) - -# world.learning_role.load_inter_episodic_data(inter_episodic_data) - -# world.run() - -# total_rewards = world.output_role.get_sum_reward() -# avg_reward = np.mean(total_rewards) -# # check reward improvement in evaluation run -# # and store best run in eval folder -# terminate = world.learning_role.compare_and_save_policies( -# {"avg_reward": avg_reward} -# ) - -# inter_episodic_data["eval_episodes_done"] = eval_episode - -# # if we have not improved in the last x evaluations, we stop loop -# if terminate: -# break - -# eval_episode += 1 - -# world.reset() - -# # if at end of simulation save last policies -# if episode == (world.learning_role.training_episodes): -# world.learning_role.rl_algorithm.save_params( -# directory=f"{world.learning_role.trained_policies_save_path}/last_policies" -# ) - -# # container shutdown implicitly with new initialisation -# logger.info("################") -# logger.info("Training finished, Start evaluation run") -# world.export_csv_path = temp_csv_path - -# world.reset() - -# # load scenario for 
evaluation -# setup_world( -# world=world, -# scenario_data=scenario_data, -# study_case=study_case, -# terminate_learning=True, -# ) - -# world.learning_role.load_inter_episodic_data(inter_episodic_data) - + print("Evaluation finished") if __name__ == "__main__": data = read_grid(Path("examples/inputs/example_01d")) diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index 11b22407f..dd508df10 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -131,6 +131,8 @@ def calculate_bids( """ + print("calculate_bids in learning_strategies.py (STEP)") + bid_quantity_inflex, bid_price_inflex = 0, 0 bid_quantity_flex, bid_price_flex = 0, 0 @@ -154,7 +156,10 @@ def calculate_bids( # ============================================================================= # 2. Get the Actions, based on the observations # ============================================================================= - actions, noise = self.get_actions(next_observation) + # actions, noise = self.get_actions(next_observation) # Old implementation with get_actions inside this class + # actions, noise = self.get_actions(self, next_observation) + # Depending on the algorithm, extra_info is either noise (MATD3) or log_probs (PPO) + actions, extra_info = self.get_actions(self, next_observation) # ============================================================================= # 3. Transform Actions into bids @@ -199,70 +204,20 @@ def calculate_bids( # store results in unit outputs as series to be written to the database by the unit operator unit.outputs["actions"][start] = actions - unit.outputs["exploration_noise"][start] = noise - - bids = self.remove_empty_bids(bids) - - return bids - - def get_actions(self, next_observation): - """ - Gets actions for a unit containing two bid prices depending on the observation. - - Args: - next_observation (torch.Tensor): Next observation. - - Returns: - Actions (torch.Tensor): Actions containing two bid prices. - - Note: - If the agent is in learning mode, the actions are chosen by the actor neuronal net and noise is added to the action - In the first x episodes the agent is in initial exploration mode, where the action is chosen by noise only to explore the entire action space. - X is defined by episodes_collecting_initial_experience. - If the agent is not in learning mode, the actions are chosen by the actor neuronal net without noise. 
- """ + # unit.outputs["exploration_noise"][start] = noise - # distinction whether we are in learning mode or not to handle exploration realised with noise - if self.learning_mode and not self.perform_evaluation: - # if we are in learning mode the first x episodes we want to explore the entire action space - # to get a good initial experience, in the area around the costs of the agent - if self.collect_initial_experience_mode: - # define current action as soley noise - noise = ( - th.normal( - mean=0.0, std=0.2, size=(1, self.act_dim), dtype=self.float_type - ) - .to(self.device) - .squeeze() - ) - - # ============================================================================= - # 2.1 Get Actions and handle exploration - # ============================================================================= - base_bid = next_observation[-1] - - # add noise to the last dimension of the observation - # needs to be adjusted if observation space is changed, because only makes sense - # if the last dimension of the observation space are the marginal cost - curr_action = noise + base_bid.clone().detach() - - else: - # if we are not in the initial exploration phase we choose the action with the actor neural net - # and add noise to the action - curr_action = self.actor(next_observation).detach() - noise = th.tensor( - self.action_noise.noise(), device=self.device, dtype=self.float_type - ) - curr_action += noise + # Check if extra_info is noise or log_probs and store it accordingly + if isinstance(extra_info, th.Tensor) and extra_info.shape == actions.shape: + unit.outputs["exploration_noise"][start] = extra_info # It's noise else: - # if we are not in learning mode we just use the actor neural net to get the action without adding noise - - curr_action = self.actor(next_observation).detach() - noise = tuple(0 for _ in range(self.act_dim)) + # print("Type of extra_info: ", extra_info) + # print(type(extra_info)) + unit.outputs["rl_log_probs"].append(extra_info) # It's log_probs + # unit.outputs["dones"][start] = False - curr_action = curr_action.clamp(-1, 1) + bids = self.remove_empty_bids(bids) - return curr_action, noise + return bids def create_observation( self, diff --git a/assume/world.py b/assume/world.py index aca7be516..5a9e7b79b 100644 --- a/assume/world.py +++ b/assume/world.py @@ -6,7 +6,7 @@ import logging import sys import time -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path from sys import platform @@ -384,11 +384,44 @@ def add_rl_unit_operator(self, id: str = "Operator-RL") -> None: "learning_agent_id": self.learning_agent_addr[1], } ) + + + # Extract algorithm from the Learning_Config + algorithm = self.learning_config.get("algorithm", "matd3") + + # Select correct train_freq based on the algorithm + if algorithm == "matd3": + train_freq = self.learning_config.get("matd3", {}).get("train_freq", "24h") + elif algorithm == "ppo": + train_freq = self.learning_config.get("ppo", {}).get("train_freq", "24h") + else: + train_freq = "24h" # Standard value if algorithm is not defined + + # Continue code with the selected frequency recurrency_task = create_rrule( start=self.start, end=self.end, - freq=self.learning_config.get("train_freq", "24h"), + freq=train_freq, ) + + # Convert train_freq to hours for comparison + freq_value = int(train_freq[:-1]) # Extract the numerical value + freq_unit = train_freq[-1] # Extract the time unit (h for hours, d for days) + + # Convert the train_freq into hours + if freq_unit == "h": + train_freq_hours = 
freq_value + elif freq_unit == "d": + train_freq_hours = freq_value * 24 + else: + train_freq_hours = 24 # Default to 24 hours + + # Calculate time difference in hours + duration_hours = int((self.end - self.start) / timedelta(hours=1)) + + # Check if train_freq is larger than the time difference + if train_freq_hours > duration_hours: + print(f"Warning: The train frequency ({train_freq_hours}h) is larger than the time difference between start and end ({duration_hours}h).") units_operator.context.schedule_recurrent_task( units_operator.write_to_learning_role, recurrency_task diff --git a/examples/inputs/example_01a/forecasts_df.csv b/examples/inputs/example_01a/forecasts_df.csv new file mode 100644 index 000000000..c2bae940e --- /dev/null +++ b/examples/inputs/example_01a/forecasts_df.csv @@ -0,0 +1,746 @@ +,fuel_price_natural gas,availability_Unit 4,availability_Unit 3,fuel_price_co2,fuel_price_oil,residual_load_EOM,fuel_price_lignite,availability_Unit 2,availability_Unit 1,fuel_price_uranium,price_EOM,fuel_price_biomass,demand_EOM,fuel_price_hard coal +2019-01-01 00:00:00,25.0,1.0,1.0,25.0,40.0,2163.3,2.0,1.0,1.0,1.0,45.05,20.0,2163.3,10.0 +2019-01-01 01:00:00,25.0,1.0,1.0,25.0,40.0,2082.7,2.0,1.0,1.0,1.0,45.05,20.0,2082.7,10.0 +2019-01-01 02:00:00,25.0,1.0,1.0,25.0,40.0,2005.7,2.0,1.0,1.0,1.0,45.05,20.0,2005.7,10.0 +2019-01-01 03:00:00,25.0,1.0,1.0,25.0,40.0,1965.6,2.0,1.0,1.0,1.0,25.65,20.0,1965.6,10.0 +2019-01-01 04:00:00,25.0,1.0,1.0,25.0,40.0,1954.85,2.0,1.0,1.0,1.0,25.65,20.0,1954.85,10.0 +2019-01-01 05:00:00,25.0,1.0,1.0,25.0,40.0,1931.75,2.0,1.0,1.0,1.0,25.65,20.0,1931.75,10.0 +2019-01-01 06:00:00,25.0,1.0,1.0,25.0,40.0,1906.1,2.0,1.0,1.0,1.0,25.65,20.0,1906.1,10.0 +2019-01-01 07:00:00,25.0,1.0,1.0,25.0,40.0,1943.75,2.0,1.0,1.0,1.0,25.65,20.0,1943.75,10.0 +2019-01-01 08:00:00,25.0,1.0,1.0,25.0,40.0,1984.65,2.0,1.0,1.0,1.0,25.65,20.0,1984.65,10.0 +2019-01-01 09:00:00,25.0,1.0,1.0,25.0,40.0,2111.8,2.0,1.0,1.0,1.0,45.05,20.0,2111.8,10.0 +2019-01-01 10:00:00,25.0,1.0,1.0,25.0,40.0,2246.25,2.0,1.0,1.0,1.0,45.05,20.0,2246.25,10.0 +2019-01-01 11:00:00,25.0,1.0,1.0,25.0,40.0,2389.6,2.0,1.0,1.0,1.0,45.05,20.0,2389.6,10.0 +2019-01-01 12:00:00,25.0,1.0,1.0,25.0,40.0,2456.15,2.0,1.0,1.0,1.0,45.05,20.0,2456.15,10.0 +2019-01-01 13:00:00,25.0,1.0,1.0,25.0,40.0,2439.6,2.0,1.0,1.0,1.0,45.05,20.0,2439.6,10.0 +2019-01-01 14:00:00,25.0,1.0,1.0,25.0,40.0,2426.7000000000003,2.0,1.0,1.0,1.0,45.05,20.0,2426.7000000000003,10.0 +2019-01-01 15:00:00,25.0,1.0,1.0,25.0,40.0,2448.35,2.0,1.0,1.0,1.0,45.05,20.0,2448.35,10.0 +2019-01-01 16:00:00,25.0,1.0,1.0,25.0,40.0,2534.65,2.0,1.0,1.0,1.0,45.05,20.0,2534.65,10.0 +2019-01-01 17:00:00,25.0,1.0,1.0,25.0,40.0,2694.85,2.0,1.0,1.0,1.0,45.05,20.0,2694.85,10.0 +2019-01-01 18:00:00,25.0,1.0,1.0,25.0,40.0,2733.2,2.0,1.0,1.0,1.0,45.05,20.0,2733.2,10.0 +2019-01-01 19:00:00,25.0,1.0,1.0,25.0,40.0,2682.55,2.0,1.0,1.0,1.0,45.05,20.0,2682.55,10.0 +2019-01-01 20:00:00,25.0,1.0,1.0,25.0,40.0,2567.55,2.0,1.0,1.0,1.0,45.05,20.0,2567.55,10.0 +2019-01-01 21:00:00,25.0,1.0,1.0,25.0,40.0,2486.85,2.0,1.0,1.0,1.0,45.05,20.0,2486.85,10.0 +2019-01-01 22:00:00,25.0,1.0,1.0,25.0,40.0,2437.0,2.0,1.0,1.0,1.0,45.05,20.0,2437.0,10.0 +2019-01-01 23:00:00,25.0,1.0,1.0,25.0,40.0,2297.85,2.0,1.0,1.0,1.0,45.05,20.0,2297.85,10.0 +2019-01-02 00:00:00,25.0,1.0,1.0,25.0,40.0,2191.4,2.0,1.0,1.0,1.0,45.05,20.0,2191.4,10.0 +2019-01-02 01:00:00,25.0,1.0,1.0,25.0,40.0,2116.25,2.0,1.0,1.0,1.0,45.05,20.0,2116.25,10.0 +2019-01-02 02:00:00,25.0,1.0,1.0,25.0,40.0,2096.9,2.0,1.0,1.0,1.0,45.05,20.0,2096.9,10.0 
+2019-01-02 03:00:00,25.0,1.0,1.0,25.0,40.0,2121.4,2.0,1.0,1.0,1.0,45.05,20.0,2121.4,10.0 +2019-01-02 04:00:00,25.0,1.0,1.0,25.0,40.0,2192.8,2.0,1.0,1.0,1.0,45.05,20.0,2192.8,10.0 +2019-01-02 05:00:00,25.0,1.0,1.0,25.0,40.0,2346.55,2.0,1.0,1.0,1.0,45.05,20.0,2346.55,10.0 +2019-01-02 06:00:00,25.0,1.0,1.0,25.0,40.0,2635.85,2.0,1.0,1.0,1.0,45.05,20.0,2635.85,10.0 +2019-01-02 07:00:00,25.0,1.0,1.0,25.0,40.0,2908.45,2.0,1.0,1.0,1.0,45.05,20.0,2908.45,10.0 +2019-01-02 08:00:00,25.0,1.0,1.0,25.0,40.0,3075.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3075.7,10.0 +2019-01-02 09:00:00,25.0,1.0,1.0,25.0,40.0,3171.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3171.5,10.0 +2019-01-02 10:00:00,25.0,1.0,1.0,25.0,40.0,3223.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3223.6,10.0 +2019-01-02 11:00:00,25.0,1.0,1.0,25.0,40.0,3289.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3289.55,10.0 +2019-01-02 12:00:00,25.0,1.0,1.0,25.0,40.0,3302.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3302.5,10.0 +2019-01-02 13:00:00,25.0,1.0,1.0,25.0,40.0,3265.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3265.85,10.0 +2019-01-02 14:00:00,25.0,1.0,1.0,25.0,40.0,3205.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3205.7,10.0 +2019-01-02 15:00:00,25.0,1.0,1.0,25.0,40.0,3198.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3198.45,10.0 +2019-01-02 16:00:00,25.0,1.0,1.0,25.0,40.0,3259.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3259.35,10.0 +2019-01-02 17:00:00,25.0,1.0,1.0,25.0,40.0,3422.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3422.0,10.0 +2019-01-02 18:00:00,25.0,1.0,1.0,25.0,40.0,3405.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3405.75,10.0 +2019-01-02 19:00:00,25.0,1.0,1.0,25.0,40.0,3321.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3321.95,10.0 +2019-01-02 20:00:00,25.0,1.0,1.0,25.0,40.0,3135.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3135.05,10.0 +2019-01-02 21:00:00,25.0,1.0,1.0,25.0,40.0,2977.15,2.0,1.0,1.0,1.0,45.05,20.0,2977.15,10.0 +2019-01-02 22:00:00,25.0,1.0,1.0,25.0,40.0,2881.65,2.0,1.0,1.0,1.0,45.05,20.0,2881.65,10.0 +2019-01-02 23:00:00,25.0,1.0,1.0,25.0,40.0,2698.15,2.0,1.0,1.0,1.0,45.05,20.0,2698.15,10.0 +2019-01-03 00:00:00,25.0,1.0,1.0,25.0,40.0,2550.85,2.0,1.0,1.0,1.0,45.05,20.0,2550.85,10.0 +2019-01-03 01:00:00,25.0,1.0,1.0,25.0,40.0,2502.1,2.0,1.0,1.0,1.0,45.05,20.0,2502.1,10.0 +2019-01-03 02:00:00,25.0,1.0,1.0,25.0,40.0,2487.9,2.0,1.0,1.0,1.0,45.05,20.0,2487.9,10.0 +2019-01-03 03:00:00,25.0,1.0,1.0,25.0,40.0,2482.85,2.0,1.0,1.0,1.0,45.05,20.0,2482.85,10.0 +2019-01-03 04:00:00,25.0,1.0,1.0,25.0,40.0,2521.6,2.0,1.0,1.0,1.0,45.05,20.0,2521.6,10.0 +2019-01-03 05:00:00,25.0,1.0,1.0,25.0,40.0,2652.4,2.0,1.0,1.0,1.0,45.05,20.0,2652.4,10.0 +2019-01-03 06:00:00,25.0,1.0,1.0,25.0,40.0,2892.3,2.0,1.0,1.0,1.0,45.05,20.0,2892.3,10.0 +2019-01-03 07:00:00,25.0,1.0,1.0,25.0,40.0,3111.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3111.35,10.0 +2019-01-03 08:00:00,25.0,1.0,1.0,25.0,40.0,3240.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3240.1,10.0 +2019-01-03 09:00:00,25.0,1.0,1.0,25.0,40.0,3304.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3304.45,10.0 +2019-01-03 10:00:00,25.0,1.0,1.0,25.0,40.0,3333.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3333.7,10.0 +2019-01-03 11:00:00,25.0,1.0,1.0,25.0,40.0,3378.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3378.35,10.0 +2019-01-03 12:00:00,25.0,1.0,1.0,25.0,40.0,3390.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3390.2,10.0 +2019-01-03 13:00:00,25.0,1.0,1.0,25.0,40.0,3331.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3331.7,10.0 +2019-01-03 14:00:00,25.0,1.0,1.0,25.0,40.0,3278.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3278.8,10.0 +2019-01-03 
15:00:00,25.0,1.0,1.0,25.0,40.0,3250.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3250.55,10.0 +2019-01-03 16:00:00,25.0,1.0,1.0,25.0,40.0,3315.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3315.5,10.0 +2019-01-03 17:00:00,25.0,1.0,1.0,25.0,40.0,3468.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3468.75,10.0 +2019-01-03 18:00:00,25.0,1.0,1.0,25.0,40.0,3452.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3452.75,10.0 +2019-01-03 19:00:00,25.0,1.0,1.0,25.0,40.0,3355.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3355.9,10.0 +2019-01-03 20:00:00,25.0,1.0,1.0,25.0,40.0,3174.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3174.25,10.0 +2019-01-03 21:00:00,25.0,1.0,1.0,25.0,40.0,3029.6000000000004,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3029.6000000000004,10.0 +2019-01-03 22:00:00,25.0,1.0,1.0,25.0,40.0,2911.75,2.0,1.0,1.0,1.0,45.05,20.0,2911.75,10.0 +2019-01-03 23:00:00,25.0,1.0,1.0,25.0,40.0,2747.45,2.0,1.0,1.0,1.0,45.05,20.0,2747.45,10.0 +2019-01-04 00:00:00,25.0,1.0,1.0,25.0,40.0,2593.5,2.0,1.0,1.0,1.0,45.05,20.0,2593.5,10.0 +2019-01-04 01:00:00,25.0,1.0,1.0,25.0,40.0,2501.8,2.0,1.0,1.0,1.0,45.05,20.0,2501.8,10.0 +2019-01-04 02:00:00,25.0,1.0,1.0,25.0,40.0,2476.3,2.0,1.0,1.0,1.0,45.05,20.0,2476.3,10.0 +2019-01-04 03:00:00,25.0,1.0,1.0,25.0,40.0,2497.55,2.0,1.0,1.0,1.0,45.05,20.0,2497.55,10.0 +2019-01-04 04:00:00,25.0,1.0,1.0,25.0,40.0,2552.15,2.0,1.0,1.0,1.0,45.05,20.0,2552.15,10.0 +2019-01-04 05:00:00,25.0,1.0,1.0,25.0,40.0,2668.9,2.0,1.0,1.0,1.0,45.05,20.0,2668.9,10.0 +2019-01-04 06:00:00,25.0,1.0,1.0,25.0,40.0,2914.55,2.0,1.0,1.0,1.0,45.05,20.0,2914.55,10.0 +2019-01-04 07:00:00,25.0,1.0,1.0,25.0,40.0,3181.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3181.35,10.0 +2019-01-04 08:00:00,25.0,1.0,1.0,25.0,40.0,3341.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3341.15,10.0 +2019-01-04 09:00:00,25.0,1.0,1.0,25.0,40.0,3417.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3417.55,10.0 +2019-01-04 10:00:00,25.0,1.0,1.0,25.0,40.0,3453.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3453.35,10.0 +2019-01-04 11:00:00,25.0,1.0,1.0,25.0,40.0,3484.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3484.95,10.0 +2019-01-04 12:00:00,25.0,1.0,1.0,25.0,40.0,3513.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3513.4,10.0 +2019-01-04 13:00:00,25.0,1.0,1.0,25.0,40.0,3452.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3452.75,10.0 +2019-01-04 14:00:00,25.0,1.0,1.0,25.0,40.0,3362.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3362.3,10.0 +2019-01-04 15:00:00,25.0,1.0,1.0,25.0,40.0,3327.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3327.9,10.0 +2019-01-04 16:00:00,25.0,1.0,1.0,25.0,40.0,3391.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3391.75,10.0 +2019-01-04 17:00:00,25.0,1.0,1.0,25.0,40.0,3496.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3496.3,10.0 +2019-01-04 18:00:00,25.0,1.0,1.0,25.0,40.0,3464.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3464.75,10.0 +2019-01-04 19:00:00,25.0,1.0,1.0,25.0,40.0,3365.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3365.9,10.0 +2019-01-04 20:00:00,25.0,1.0,1.0,25.0,40.0,3167.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3167.05,10.0 +2019-01-04 21:00:00,25.0,1.0,1.0,25.0,40.0,3016.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3016.75,10.0 +2019-01-04 22:00:00,25.0,1.0,1.0,25.0,40.0,2924.35,2.0,1.0,1.0,1.0,45.05,20.0,2924.35,10.0 +2019-01-04 23:00:00,25.0,1.0,1.0,25.0,40.0,2724.7,2.0,1.0,1.0,1.0,45.05,20.0,2724.7,10.0 +2019-01-05 00:00:00,25.0,1.0,1.0,25.0,40.0,2582.8,2.0,1.0,1.0,1.0,45.05,20.0,2582.8,10.0 +2019-01-05 01:00:00,25.0,1.0,1.0,25.0,40.0,2465.75,2.0,1.0,1.0,1.0,45.05,20.0,2465.75,10.0 +2019-01-05 
02:00:00,25.0,1.0,1.0,25.0,40.0,2401.5,2.0,1.0,1.0,1.0,45.05,20.0,2401.5,10.0 +2019-01-05 03:00:00,25.0,1.0,1.0,25.0,40.0,2377.75,2.0,1.0,1.0,1.0,45.05,20.0,2377.75,10.0 +2019-01-05 04:00:00,25.0,1.0,1.0,25.0,40.0,2381.7,2.0,1.0,1.0,1.0,45.05,20.0,2381.7,10.0 +2019-01-05 05:00:00,25.0,1.0,1.0,25.0,40.0,2385.1,2.0,1.0,1.0,1.0,45.05,20.0,2385.1,10.0 +2019-01-05 06:00:00,25.0,1.0,1.0,25.0,40.0,2416.5,2.0,1.0,1.0,1.0,45.05,20.0,2416.5,10.0 +2019-01-05 07:00:00,25.0,1.0,1.0,25.0,40.0,2538.05,2.0,1.0,1.0,1.0,45.05,20.0,2538.05,10.0 +2019-01-05 08:00:00,25.0,1.0,1.0,25.0,40.0,2715.3,2.0,1.0,1.0,1.0,45.05,20.0,2715.3,10.0 +2019-01-05 09:00:00,25.0,1.0,1.0,25.0,40.0,2879.6,2.0,1.0,1.0,1.0,45.05,20.0,2879.6,10.0 +2019-01-05 10:00:00,25.0,1.0,1.0,25.0,40.0,2985.6,2.0,1.0,1.0,1.0,45.05,20.0,2985.6,10.0 +2019-01-05 11:00:00,25.0,1.0,1.0,25.0,40.0,3046.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3046.05,10.0 +2019-01-05 12:00:00,25.0,1.0,1.0,25.0,40.0,3039.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3039.6,10.0 +2019-01-05 13:00:00,25.0,1.0,1.0,25.0,40.0,2975.9,2.0,1.0,1.0,1.0,45.05,20.0,2975.9,10.0 +2019-01-05 14:00:00,25.0,1.0,1.0,25.0,40.0,2923.65,2.0,1.0,1.0,1.0,45.05,20.0,2923.65,10.0 +2019-01-05 15:00:00,25.0,1.0,1.0,25.0,40.0,2901.85,2.0,1.0,1.0,1.0,45.05,20.0,2901.85,10.0 +2019-01-05 16:00:00,25.0,1.0,1.0,25.0,40.0,2952.7,2.0,1.0,1.0,1.0,45.05,20.0,2952.7,10.0 +2019-01-05 17:00:00,25.0,1.0,1.0,25.0,40.0,3094.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3094.45,10.0 +2019-01-05 18:00:00,25.0,1.0,1.0,25.0,40.0,3096.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3096.85,10.0 +2019-01-05 19:00:00,25.0,1.0,1.0,25.0,40.0,3005.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3005.0,10.0 +2019-01-05 20:00:00,25.0,1.0,1.0,25.0,40.0,2814.35,2.0,1.0,1.0,1.0,45.05,20.0,2814.35,10.0 +2019-01-05 21:00:00,25.0,1.0,1.0,25.0,40.0,2696.8,2.0,1.0,1.0,1.0,45.05,20.0,2696.8,10.0 +2019-01-05 22:00:00,25.0,1.0,1.0,25.0,40.0,2644.85,2.0,1.0,1.0,1.0,45.05,20.0,2644.85,10.0 +2019-01-05 23:00:00,25.0,1.0,1.0,25.0,40.0,2496.0,2.0,1.0,1.0,1.0,45.05,20.0,2496.0,10.0 +2019-01-06 00:00:00,25.0,1.0,1.0,25.0,40.0,2330.15,2.0,1.0,1.0,1.0,45.05,20.0,2330.15,10.0 +2019-01-06 01:00:00,25.0,1.0,1.0,25.0,40.0,2227.3,2.0,1.0,1.0,1.0,45.05,20.0,2227.3,10.0 +2019-01-06 02:00:00,25.0,1.0,1.0,25.0,40.0,2160.5,2.0,1.0,1.0,1.0,45.05,20.0,2160.5,10.0 +2019-01-06 03:00:00,25.0,1.0,1.0,25.0,40.0,2152.2,2.0,1.0,1.0,1.0,45.05,20.0,2152.2,10.0 +2019-01-06 04:00:00,25.0,1.0,1.0,25.0,40.0,2154.3,2.0,1.0,1.0,1.0,45.05,20.0,2154.3,10.0 +2019-01-06 05:00:00,25.0,1.0,1.0,25.0,40.0,2132.85,2.0,1.0,1.0,1.0,45.05,20.0,2132.85,10.0 +2019-01-06 06:00:00,25.0,1.0,1.0,25.0,40.0,2112.85,2.0,1.0,1.0,1.0,45.05,20.0,2112.85,10.0 +2019-01-06 07:00:00,25.0,1.0,1.0,25.0,40.0,2167.55,2.0,1.0,1.0,1.0,45.05,20.0,2167.55,10.0 +2019-01-06 08:00:00,25.0,1.0,1.0,25.0,40.0,2297.35,2.0,1.0,1.0,1.0,45.05,20.0,2297.35,10.0 +2019-01-06 09:00:00,25.0,1.0,1.0,25.0,40.0,2472.95,2.0,1.0,1.0,1.0,45.05,20.0,2472.95,10.0 +2019-01-06 10:00:00,25.0,1.0,1.0,25.0,40.0,2622.1,2.0,1.0,1.0,1.0,45.05,20.0,2622.1,10.0 +2019-01-06 11:00:00,25.0,1.0,1.0,25.0,40.0,2750.0,2.0,1.0,1.0,1.0,45.05,20.0,2750.0,10.0 +2019-01-06 12:00:00,25.0,1.0,1.0,25.0,40.0,2773.15,2.0,1.0,1.0,1.0,45.05,20.0,2773.15,10.0 +2019-01-06 13:00:00,25.0,1.0,1.0,25.0,40.0,2704.3,2.0,1.0,1.0,1.0,45.05,20.0,2704.3,10.0 +2019-01-06 14:00:00,25.0,1.0,1.0,25.0,40.0,2673.75,2.0,1.0,1.0,1.0,45.05,20.0,2673.75,10.0 +2019-01-06 15:00:00,25.0,1.0,1.0,25.0,40.0,2635.45,2.0,1.0,1.0,1.0,45.05,20.0,2635.45,10.0 +2019-01-06 
16:00:00,25.0,1.0,1.0,25.0,40.0,2722.9,2.0,1.0,1.0,1.0,45.05,20.0,2722.9,10.0 +2019-01-06 17:00:00,25.0,1.0,1.0,25.0,40.0,2898.1,2.0,1.0,1.0,1.0,45.05,20.0,2898.1,10.0 +2019-01-06 18:00:00,25.0,1.0,1.0,25.0,40.0,2943.5,2.0,1.0,1.0,1.0,45.05,20.0,2943.5,10.0 +2019-01-06 19:00:00,25.0,1.0,1.0,25.0,40.0,2864.6,2.0,1.0,1.0,1.0,45.05,20.0,2864.6,10.0 +2019-01-06 20:00:00,25.0,1.0,1.0,25.0,40.0,2747.65,2.0,1.0,1.0,1.0,45.05,20.0,2747.65,10.0 +2019-01-06 21:00:00,25.0,1.0,1.0,25.0,40.0,2670.0,2.0,1.0,1.0,1.0,45.05,20.0,2670.0,10.0 +2019-01-06 22:00:00,25.0,1.0,1.0,25.0,40.0,2636.2,2.0,1.0,1.0,1.0,45.05,20.0,2636.2,10.0 +2019-01-06 23:00:00,25.0,1.0,1.0,25.0,40.0,2501.95,2.0,1.0,1.0,1.0,45.05,20.0,2501.95,10.0 +2019-01-07 00:00:00,25.0,1.0,1.0,25.0,40.0,2353.4,2.0,1.0,1.0,1.0,45.05,20.0,2353.4,10.0 +2019-01-07 01:00:00,25.0,1.0,1.0,25.0,40.0,2265.65,2.0,1.0,1.0,1.0,45.05,20.0,2265.65,10.0 +2019-01-07 02:00:00,25.0,1.0,1.0,25.0,40.0,2241.7,2.0,1.0,1.0,1.0,45.05,20.0,2241.7,10.0 +2019-01-07 03:00:00,25.0,1.0,1.0,25.0,40.0,2275.5,2.0,1.0,1.0,1.0,45.05,20.0,2275.5,10.0 +2019-01-07 04:00:00,25.0,1.0,1.0,25.0,40.0,2367.8,2.0,1.0,1.0,1.0,45.05,20.0,2367.8,10.0 +2019-01-07 05:00:00,25.0,1.0,1.0,25.0,40.0,2585.05,2.0,1.0,1.0,1.0,45.05,20.0,2585.05,10.0 +2019-01-07 06:00:00,25.0,1.0,1.0,25.0,40.0,3015.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3015.7,10.0 +2019-01-07 07:00:00,25.0,1.0,1.0,25.0,40.0,3331.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3331.45,10.0 +2019-01-07 08:00:00,25.0,1.0,1.0,25.0,40.0,3464.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3464.15,10.0 +2019-01-07 09:00:00,25.0,1.0,1.0,25.0,40.0,3478.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3478.2,10.0 +2019-01-07 10:00:00,25.0,1.0,1.0,25.0,40.0,3542.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3542.6,10.0 +2019-01-07 11:00:00,25.0,1.0,1.0,25.0,40.0,3589.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3589.3,10.0 +2019-01-07 12:00:00,25.0,1.0,1.0,25.0,40.0,3585.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3585.75,10.0 +2019-01-07 13:00:00,25.0,1.0,1.0,25.0,40.0,3587.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3587.6,10.0 +2019-01-07 14:00:00,25.0,1.0,1.0,25.0,40.0,3570.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3570.95,10.0 +2019-01-07 15:00:00,25.0,1.0,1.0,25.0,40.0,3572.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3572.1,10.0 +2019-01-07 16:00:00,25.0,1.0,1.0,25.0,40.0,3625.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3625.75,10.0 +2019-01-07 17:00:00,25.0,1.0,1.0,25.0,40.0,3734.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3734.0,10.0 +2019-01-07 18:00:00,25.0,1.0,1.0,25.0,40.0,3691.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3691.5,10.0 +2019-01-07 19:00:00,25.0,1.0,1.0,25.0,40.0,3595.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3595.65,10.0 +2019-01-07 20:00:00,25.0,1.0,1.0,25.0,40.0,3428.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3428.9,10.0 +2019-01-07 21:00:00,25.0,1.0,1.0,25.0,40.0,3276.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3276.45,10.0 +2019-01-07 22:00:00,25.0,1.0,1.0,25.0,40.0,3114.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3114.75,10.0 +2019-01-07 23:00:00,25.0,1.0,1.0,25.0,40.0,2918.1,2.0,1.0,1.0,1.0,45.05,20.0,2918.1,10.0 +2019-01-08 00:00:00,25.0,1.0,1.0,25.0,40.0,2726.35,2.0,1.0,1.0,1.0,45.05,20.0,2726.35,10.0 +2019-01-08 01:00:00,25.0,1.0,1.0,25.0,40.0,2637.75,2.0,1.0,1.0,1.0,45.05,20.0,2637.75,10.0 +2019-01-08 02:00:00,25.0,1.0,1.0,25.0,40.0,2594.5,2.0,1.0,1.0,1.0,45.05,20.0,2594.5,10.0 +2019-01-08 03:00:00,25.0,1.0,1.0,25.0,40.0,2623.85,2.0,1.0,1.0,1.0,45.05,20.0,2623.85,10.0 +2019-01-08 
04:00:00,25.0,1.0,1.0,25.0,40.0,2704.8500000000004,2.0,1.0,1.0,1.0,45.05,20.0,2704.8500000000004,10.0 +2019-01-08 05:00:00,25.0,1.0,1.0,25.0,40.0,2874.4,2.0,1.0,1.0,1.0,45.05,20.0,2874.4,10.0 +2019-01-08 06:00:00,25.0,1.0,1.0,25.0,40.0,3236.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3236.3,10.0 +2019-01-08 07:00:00,25.0,1.0,1.0,25.0,40.0,3550.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3550.7,10.0 +2019-01-08 08:00:00,25.0,1.0,1.0,25.0,40.0,3672.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3672.35,10.0 +2019-01-08 09:00:00,25.0,1.0,1.0,25.0,40.0,3677.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3677.75,10.0 +2019-01-08 10:00:00,25.0,1.0,1.0,25.0,40.0,3709.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3709.45,10.0 +2019-01-08 11:00:00,25.0,1.0,1.0,25.0,40.0,3739.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3739.6,10.0 +2019-01-08 12:00:00,25.0,1.0,1.0,25.0,40.0,3727.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3727.7,10.0 +2019-01-08 13:00:00,25.0,1.0,1.0,25.0,40.0,3717.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3717.7,10.0 +2019-01-08 14:00:00,25.0,1.0,1.0,25.0,40.0,3678.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3678.4,10.0 +2019-01-08 15:00:00,25.0,1.0,1.0,25.0,40.0,3658.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3658.05,10.0 +2019-01-08 16:00:00,25.0,1.0,1.0,25.0,40.0,3695.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3695.4,10.0 +2019-01-08 17:00:00,25.0,1.0,1.0,25.0,40.0,3790.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3790.15,10.0 +2019-01-08 18:00:00,25.0,1.0,1.0,25.0,40.0,3763.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3763.35,10.0 +2019-01-08 19:00:00,25.0,1.0,1.0,25.0,40.0,3674.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3674.5,10.0 +2019-01-08 20:00:00,25.0,1.0,1.0,25.0,40.0,3479.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3479.45,10.0 +2019-01-08 21:00:00,25.0,1.0,1.0,25.0,40.0,3301.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3301.55,10.0 +2019-01-08 22:00:00,25.0,1.0,1.0,25.0,40.0,3147.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3147.05,10.0 +2019-01-08 23:00:00,25.0,1.0,1.0,25.0,40.0,2963.15,2.0,1.0,1.0,1.0,45.05,20.0,2963.15,10.0 +2019-01-09 00:00:00,25.0,1.0,1.0,25.0,40.0,2777.55,2.0,1.0,1.0,1.0,45.05,20.0,2777.55,10.0 +2019-01-09 01:00:00,25.0,1.0,1.0,25.0,40.0,2685.7,2.0,1.0,1.0,1.0,45.05,20.0,2685.7,10.0 +2019-01-09 02:00:00,25.0,1.0,1.0,25.0,40.0,2638.6,2.0,1.0,1.0,1.0,45.05,20.0,2638.6,10.0 +2019-01-09 03:00:00,25.0,1.0,1.0,25.0,40.0,2661.5,2.0,1.0,1.0,1.0,45.05,20.0,2661.5,10.0 +2019-01-09 04:00:00,25.0,1.0,1.0,25.0,40.0,2738.35,2.0,1.0,1.0,1.0,45.05,20.0,2738.35,10.0 +2019-01-09 05:00:00,25.0,1.0,1.0,25.0,40.0,2890.0,2.0,1.0,1.0,1.0,45.05,20.0,2890.0,10.0 +2019-01-09 06:00:00,25.0,1.0,1.0,25.0,40.0,3246.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3246.9,10.0 +2019-01-09 07:00:00,25.0,1.0,1.0,25.0,40.0,3569.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3569.55,10.0 +2019-01-09 08:00:00,25.0,1.0,1.0,25.0,40.0,3703.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3703.15,10.0 +2019-01-09 09:00:00,25.0,1.0,1.0,25.0,40.0,3730.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3730.45,10.0 +2019-01-09 10:00:00,25.0,1.0,1.0,25.0,40.0,3772.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3772.75,10.0 +2019-01-09 11:00:00,25.0,1.0,1.0,25.0,40.0,3816.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3816.55,10.0 +2019-01-09 12:00:00,25.0,1.0,1.0,25.0,40.0,3792.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3792.45,10.0 +2019-01-09 13:00:00,25.0,1.0,1.0,25.0,40.0,3776.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3776.8,10.0 +2019-01-09 14:00:00,25.0,1.0,1.0,25.0,40.0,3742.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3742.7,10.0 +2019-01-09 
15:00:00,25.0,1.0,1.0,25.0,40.0,3707.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3707.4,10.0 +2019-01-09 16:00:00,25.0,1.0,1.0,25.0,40.0,3728.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3728.8,10.0 +2019-01-09 17:00:00,25.0,1.0,1.0,25.0,40.0,3845.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3845.15,10.0 +2019-01-09 18:00:00,25.0,1.0,1.0,25.0,40.0,3804.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3804.35,10.0 +2019-01-09 19:00:00,25.0,1.0,1.0,25.0,40.0,3714.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3714.85,10.0 +2019-01-09 20:00:00,25.0,1.0,1.0,25.0,40.0,3503.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3503.05,10.0 +2019-01-09 21:00:00,25.0,1.0,1.0,25.0,40.0,3325.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3325.15,10.0 +2019-01-09 22:00:00,25.0,1.0,1.0,25.0,40.0,3174.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3174.7,10.0 +2019-01-09 23:00:00,25.0,1.0,1.0,25.0,40.0,2947.75,2.0,1.0,1.0,1.0,45.05,20.0,2947.75,10.0 +2019-01-10 00:00:00,25.0,1.0,1.0,25.0,40.0,2816.15,2.0,1.0,1.0,1.0,45.05,20.0,2816.15,10.0 +2019-01-10 01:00:00,25.0,1.0,1.0,25.0,40.0,2697.95,2.0,1.0,1.0,1.0,45.05,20.0,2697.95,10.0 +2019-01-10 02:00:00,25.0,1.0,1.0,25.0,40.0,2649.9,2.0,1.0,1.0,1.0,45.05,20.0,2649.9,10.0 +2019-01-10 03:00:00,25.0,1.0,1.0,25.0,40.0,2657.15,2.0,1.0,1.0,1.0,45.05,20.0,2657.15,10.0 +2019-01-10 04:00:00,25.0,1.0,1.0,25.0,40.0,2692.25,2.0,1.0,1.0,1.0,45.05,20.0,2692.25,10.0 +2019-01-10 05:00:00,25.0,1.0,1.0,25.0,40.0,2830.65,2.0,1.0,1.0,1.0,45.05,20.0,2830.65,10.0 +2019-01-10 06:00:00,25.0,1.0,1.0,25.0,40.0,3177.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3177.8,10.0 +2019-01-10 07:00:00,25.0,1.0,1.0,25.0,40.0,3467.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3467.45,10.0 +2019-01-10 08:00:00,25.0,1.0,1.0,25.0,40.0,3590.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3590.6,10.0 +2019-01-10 09:00:00,25.0,1.0,1.0,25.0,40.0,3593.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3593.15,10.0 +2019-01-10 10:00:00,25.0,1.0,1.0,25.0,40.0,3636.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3636.4,10.0 +2019-01-10 11:00:00,25.0,1.0,1.0,25.0,40.0,3665.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3665.65,10.0 +2019-01-10 12:00:00,25.0,1.0,1.0,25.0,40.0,3652.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3652.6,10.0 +2019-01-10 13:00:00,25.0,1.0,1.0,25.0,40.0,3625.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3625.95,10.0 +2019-01-10 14:00:00,25.0,1.0,1.0,25.0,40.0,3573.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3573.15,10.0 +2019-01-10 15:00:00,25.0,1.0,1.0,25.0,40.0,3550.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3550.55,10.0 +2019-01-10 16:00:00,25.0,1.0,1.0,25.0,40.0,3570.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3570.75,10.0 +2019-01-10 17:00:00,25.0,1.0,1.0,25.0,40.0,3688.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3688.05,10.0 +2019-01-10 18:00:00,25.0,1.0,1.0,25.0,40.0,3654.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3654.5,10.0 +2019-01-10 19:00:00,25.0,1.0,1.0,25.0,40.0,3570.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3570.0,10.0 +2019-01-10 20:00:00,25.0,1.0,1.0,25.0,40.0,3393.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3393.3,10.0 +2019-01-10 21:00:00,25.0,1.0,1.0,25.0,40.0,3210.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3210.3,10.0 +2019-01-10 22:00:00,25.0,1.0,1.0,25.0,40.0,3107.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3107.25,10.0 +2019-01-10 23:00:00,25.0,1.0,1.0,25.0,40.0,2918.55,2.0,1.0,1.0,1.0,45.05,20.0,2918.55,10.0 +2019-01-11 00:00:00,25.0,1.0,1.0,25.0,40.0,2809.6000000000004,2.0,1.0,1.0,1.0,45.05,20.0,2809.6000000000004,10.0 +2019-01-11 01:00:00,25.0,1.0,1.0,25.0,40.0,2705.1,2.0,1.0,1.0,1.0,45.05,20.0,2705.1,10.0 +2019-01-11 
02:00:00,25.0,1.0,1.0,25.0,40.0,2679.4,2.0,1.0,1.0,1.0,45.05,20.0,2679.4,10.0 +2019-01-11 03:00:00,25.0,1.0,1.0,25.0,40.0,2702.0,2.0,1.0,1.0,1.0,45.05,20.0,2702.0,10.0 +2019-01-11 04:00:00,25.0,1.0,1.0,25.0,40.0,2770.4,2.0,1.0,1.0,1.0,45.05,20.0,2770.4,10.0 +2019-01-11 05:00:00,25.0,1.0,1.0,25.0,40.0,2914.55,2.0,1.0,1.0,1.0,45.05,20.0,2914.55,10.0 +2019-01-11 06:00:00,25.0,1.0,1.0,25.0,40.0,3293.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3293.9,10.0 +2019-01-11 07:00:00,25.0,1.0,1.0,25.0,40.0,3599.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3599.05,10.0 +2019-01-11 08:00:00,25.0,1.0,1.0,25.0,40.0,3736.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3736.35,10.0 +2019-01-11 09:00:00,25.0,1.0,1.0,25.0,40.0,3759.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3759.75,10.0 +2019-01-11 10:00:00,25.0,1.0,1.0,25.0,40.0,3792.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3792.85,10.0 +2019-01-11 11:00:00,25.0,1.0,1.0,25.0,40.0,3831.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3831.55,10.0 +2019-01-11 12:00:00,25.0,1.0,1.0,25.0,40.0,3803.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3803.0,10.0 +2019-01-11 13:00:00,25.0,1.0,1.0,25.0,40.0,3736.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3736.6,10.0 +2019-01-11 14:00:00,25.0,1.0,1.0,25.0,40.0,3648.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3648.95,10.0 +2019-01-11 15:00:00,25.0,1.0,1.0,25.0,40.0,3600.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3600.85,10.0 +2019-01-11 16:00:00,25.0,1.0,1.0,25.0,40.0,3618.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3618.25,10.0 +2019-01-11 17:00:00,25.0,1.0,1.0,25.0,40.0,3718.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3718.0,10.0 +2019-01-11 18:00:00,25.0,1.0,1.0,25.0,40.0,3661.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3661.0,10.0 +2019-01-11 19:00:00,25.0,1.0,1.0,25.0,40.0,3571.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3571.75,10.0 +2019-01-11 20:00:00,25.0,1.0,1.0,25.0,40.0,3353.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3353.9,10.0 +2019-01-11 21:00:00,25.0,1.0,1.0,25.0,40.0,3170.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3170.4,10.0 +2019-01-11 22:00:00,25.0,1.0,1.0,25.0,40.0,3049.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3049.85,10.0 +2019-01-11 23:00:00,25.0,1.0,1.0,25.0,40.0,2862.8,2.0,1.0,1.0,1.0,45.05,20.0,2862.8,10.0 +2019-01-12 00:00:00,25.0,1.0,1.0,25.0,40.0,2657.1,2.0,1.0,1.0,1.0,45.05,20.0,2657.1,10.0 +2019-01-12 01:00:00,25.0,1.0,1.0,25.0,40.0,2533.2,2.0,1.0,1.0,1.0,45.05,20.0,2533.2,10.0 +2019-01-12 02:00:00,25.0,1.0,1.0,25.0,40.0,2446.6,2.0,1.0,1.0,1.0,45.05,20.0,2446.6,10.0 +2019-01-12 03:00:00,25.0,1.0,1.0,25.0,40.0,2411.35,2.0,1.0,1.0,1.0,45.05,20.0,2411.35,10.0 +2019-01-12 04:00:00,25.0,1.0,1.0,25.0,40.0,2429.65,2.0,1.0,1.0,1.0,45.05,20.0,2429.65,10.0 +2019-01-12 05:00:00,25.0,1.0,1.0,25.0,40.0,2448.15,2.0,1.0,1.0,1.0,45.05,20.0,2448.15,10.0 +2019-01-12 06:00:00,25.0,1.0,1.0,25.0,40.0,2522.15,2.0,1.0,1.0,1.0,45.05,20.0,2522.15,10.0 +2019-01-12 07:00:00,25.0,1.0,1.0,25.0,40.0,2680.95,2.0,1.0,1.0,1.0,45.05,20.0,2680.95,10.0 +2019-01-12 08:00:00,25.0,1.0,1.0,25.0,40.0,2858.05,2.0,1.0,1.0,1.0,45.05,20.0,2858.05,10.0 +2019-01-12 09:00:00,25.0,1.0,1.0,25.0,40.0,3033.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3033.45,10.0 +2019-01-12 10:00:00,25.0,1.0,1.0,25.0,40.0,3158.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3158.5,10.0 +2019-01-12 11:00:00,25.0,1.0,1.0,25.0,40.0,3229.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3229.15,10.0 +2019-01-12 12:00:00,25.0,1.0,1.0,25.0,40.0,3208.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3208.6,10.0 +2019-01-12 13:00:00,25.0,1.0,1.0,25.0,40.0,3128.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3128.2,10.0 +2019-01-12 
14:00:00,25.0,1.0,1.0,25.0,40.0,3074.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3074.55,10.0 +2019-01-12 15:00:00,25.0,1.0,1.0,25.0,40.0,3049.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3049.8,10.0 +2019-01-12 16:00:00,25.0,1.0,1.0,25.0,40.0,3102.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3102.55,10.0 +2019-01-12 17:00:00,25.0,1.0,1.0,25.0,40.0,3228.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3228.65,10.0 +2019-01-12 18:00:00,25.0,1.0,1.0,25.0,40.0,3214.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3214.4,10.0 +2019-01-12 19:00:00,25.0,1.0,1.0,25.0,40.0,3109.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3109.9,10.0 +2019-01-12 20:00:00,25.0,1.0,1.0,25.0,40.0,2941.1000000000004,2.0,1.0,1.0,1.0,45.05,20.0,2941.1000000000004,10.0 +2019-01-12 21:00:00,25.0,1.0,1.0,25.0,40.0,2798.75,2.0,1.0,1.0,1.0,45.05,20.0,2798.75,10.0 +2019-01-12 22:00:00,25.0,1.0,1.0,25.0,40.0,2737.7,2.0,1.0,1.0,1.0,45.05,20.0,2737.7,10.0 +2019-01-12 23:00:00,25.0,1.0,1.0,25.0,40.0,2606.9,2.0,1.0,1.0,1.0,45.05,20.0,2606.9,10.0 +2019-01-13 00:00:00,25.0,1.0,1.0,25.0,40.0,2431.55,2.0,1.0,1.0,1.0,45.05,20.0,2431.55,10.0 +2019-01-13 01:00:00,25.0,1.0,1.0,25.0,40.0,2323.05,2.0,1.0,1.0,1.0,45.05,20.0,2323.05,10.0 +2019-01-13 02:00:00,25.0,1.0,1.0,25.0,40.0,2257.5,2.0,1.0,1.0,1.0,45.05,20.0,2257.5,10.0 +2019-01-13 03:00:00,25.0,1.0,1.0,25.0,40.0,2218.85,2.0,1.0,1.0,1.0,45.05,20.0,2218.85,10.0 +2019-01-13 04:00:00,25.0,1.0,1.0,25.0,40.0,2211.95,2.0,1.0,1.0,1.0,45.05,20.0,2211.95,10.0 +2019-01-13 05:00:00,25.0,1.0,1.0,25.0,40.0,2195.75,2.0,1.0,1.0,1.0,45.05,20.0,2195.75,10.0 +2019-01-13 06:00:00,25.0,1.0,1.0,25.0,40.0,2170.25,2.0,1.0,1.0,1.0,45.05,20.0,2170.25,10.0 +2019-01-13 07:00:00,25.0,1.0,1.0,25.0,40.0,2276.3,2.0,1.0,1.0,1.0,45.05,20.0,2276.3,10.0 +2019-01-13 08:00:00,25.0,1.0,1.0,25.0,40.0,2461.55,2.0,1.0,1.0,1.0,45.05,20.0,2461.55,10.0 +2019-01-13 09:00:00,25.0,1.0,1.0,25.0,40.0,2670.55,2.0,1.0,1.0,1.0,45.05,20.0,2670.55,10.0 +2019-01-13 10:00:00,25.0,1.0,1.0,25.0,40.0,2829.45,2.0,1.0,1.0,1.0,45.05,20.0,2829.45,10.0 +2019-01-13 11:00:00,25.0,1.0,1.0,25.0,40.0,2985.7,2.0,1.0,1.0,1.0,45.05,20.0,2985.7,10.0 +2019-01-13 12:00:00,25.0,1.0,1.0,25.0,40.0,2992.1,2.0,1.0,1.0,1.0,45.05,20.0,2992.1,10.0 +2019-01-13 13:00:00,25.0,1.0,1.0,25.0,40.0,2940.1,2.0,1.0,1.0,1.0,45.05,20.0,2940.1,10.0 +2019-01-13 14:00:00,25.0,1.0,1.0,25.0,40.0,2895.55,2.0,1.0,1.0,1.0,45.05,20.0,2895.55,10.0 +2019-01-13 15:00:00,25.0,1.0,1.0,25.0,40.0,2874.55,2.0,1.0,1.0,1.0,45.05,20.0,2874.55,10.0 +2019-01-13 16:00:00,25.0,1.0,1.0,25.0,40.0,2922.4,2.0,1.0,1.0,1.0,45.05,20.0,2922.4,10.0 +2019-01-13 17:00:00,25.0,1.0,1.0,25.0,40.0,3061.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3061.95,10.0 +2019-01-13 18:00:00,25.0,1.0,1.0,25.0,40.0,3105.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3105.45,10.0 +2019-01-13 19:00:00,25.0,1.0,1.0,25.0,40.0,3032.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3032.35,10.0 +2019-01-13 20:00:00,25.0,1.0,1.0,25.0,40.0,2892.75,2.0,1.0,1.0,1.0,45.05,20.0,2892.75,10.0 +2019-01-13 21:00:00,25.0,1.0,1.0,25.0,40.0,2819.0,2.0,1.0,1.0,1.0,45.05,20.0,2819.0,10.0 +2019-01-13 22:00:00,25.0,1.0,1.0,25.0,40.0,2803.7,2.0,1.0,1.0,1.0,45.05,20.0,2803.7,10.0 +2019-01-13 23:00:00,25.0,1.0,1.0,25.0,40.0,2673.3,2.0,1.0,1.0,1.0,45.05,20.0,2673.3,10.0 +2019-01-14 00:00:00,25.0,1.0,1.0,25.0,40.0,2542.55,2.0,1.0,1.0,1.0,45.05,20.0,2542.55,10.0 +2019-01-14 01:00:00,25.0,1.0,1.0,25.0,40.0,2435.3500000000004,2.0,1.0,1.0,1.0,45.05,20.0,2435.3500000000004,10.0 +2019-01-14 02:00:00,25.0,1.0,1.0,25.0,40.0,2402.75,2.0,1.0,1.0,1.0,45.05,20.0,2402.75,10.0 +2019-01-14 
03:00:00,25.0,1.0,1.0,25.0,40.0,2429.2,2.0,1.0,1.0,1.0,45.05,20.0,2429.2,10.0 +2019-01-14 04:00:00,25.0,1.0,1.0,25.0,40.0,2514.75,2.0,1.0,1.0,1.0,45.05,20.0,2514.75,10.0 +2019-01-14 05:00:00,25.0,1.0,1.0,25.0,40.0,2734.55,2.0,1.0,1.0,1.0,45.05,20.0,2734.55,10.0 +2019-01-14 06:00:00,25.0,1.0,1.0,25.0,40.0,3157.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3157.15,10.0 +2019-01-14 07:00:00,25.0,1.0,1.0,25.0,40.0,3498.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3498.55,10.0 +2019-01-14 08:00:00,25.0,1.0,1.0,25.0,40.0,3627.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3627.4,10.0 +2019-01-14 09:00:00,25.0,1.0,1.0,25.0,40.0,3634.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3634.6,10.0 +2019-01-14 10:00:00,25.0,1.0,1.0,25.0,40.0,3651.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3651.8,10.0 +2019-01-14 11:00:00,25.0,1.0,1.0,25.0,40.0,3692.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3692.35,10.0 +2019-01-14 12:00:00,25.0,1.0,1.0,25.0,40.0,3683.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3683.0,10.0 +2019-01-14 13:00:00,25.0,1.0,1.0,25.0,40.0,3682.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3682.7,10.0 +2019-01-14 14:00:00,25.0,1.0,1.0,25.0,40.0,3651.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3651.45,10.0 +2019-01-14 15:00:00,25.0,1.0,1.0,25.0,40.0,3604.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3604.8,10.0 +2019-01-14 16:00:00,25.0,1.0,1.0,25.0,40.0,3582.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3582.85,10.0 +2019-01-14 17:00:00,25.0,1.0,1.0,25.0,40.0,3740.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3740.6,10.0 +2019-01-14 18:00:00,25.0,1.0,1.0,25.0,40.0,3718.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3718.95,10.0 +2019-01-14 19:00:00,25.0,1.0,1.0,25.0,40.0,3627.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3627.7,10.0 +2019-01-14 20:00:00,25.0,1.0,1.0,25.0,40.0,3428.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3428.0,10.0 +2019-01-14 21:00:00,25.0,1.0,1.0,25.0,40.0,3256.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3256.45,10.0 +2019-01-14 22:00:00,25.0,1.0,1.0,25.0,40.0,3104.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3104.75,10.0 +2019-01-14 23:00:00,25.0,1.0,1.0,25.0,40.0,2898.65,2.0,1.0,1.0,1.0,45.05,20.0,2898.65,10.0 +2019-01-15 00:00:00,25.0,1.0,1.0,25.0,40.0,2764.5,2.0,1.0,1.0,1.0,45.05,20.0,2764.5,10.0 +2019-01-15 01:00:00,25.0,1.0,1.0,25.0,40.0,2677.45,2.0,1.0,1.0,1.0,45.05,20.0,2677.45,10.0 +2019-01-15 02:00:00,25.0,1.0,1.0,25.0,40.0,2648.0,2.0,1.0,1.0,1.0,45.05,20.0,2648.0,10.0 +2019-01-15 03:00:00,25.0,1.0,1.0,25.0,40.0,2658.05,2.0,1.0,1.0,1.0,45.05,20.0,2658.05,10.0 +2019-01-15 04:00:00,25.0,1.0,1.0,25.0,40.0,2712.55,2.0,1.0,1.0,1.0,45.05,20.0,2712.55,10.0 +2019-01-15 05:00:00,25.0,1.0,1.0,25.0,40.0,2885.3,2.0,1.0,1.0,1.0,45.05,20.0,2885.3,10.0 +2019-01-15 06:00:00,25.0,1.0,1.0,25.0,40.0,3224.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3224.55,10.0 +2019-01-15 07:00:00,25.0,1.0,1.0,25.0,40.0,3553.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3553.8,10.0 +2019-01-15 08:00:00,25.0,1.0,1.0,25.0,40.0,3676.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3676.35,10.0 +2019-01-15 09:00:00,25.0,1.0,1.0,25.0,40.0,3677.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3677.25,10.0 +2019-01-15 10:00:00,25.0,1.0,1.0,25.0,40.0,3724.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3724.55,10.0 +2019-01-15 11:00:00,25.0,1.0,1.0,25.0,40.0,3746.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3746.8,10.0 +2019-01-15 12:00:00,25.0,1.0,1.0,25.0,40.0,3715.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3715.65,10.0 +2019-01-15 13:00:00,25.0,1.0,1.0,25.0,40.0,3708.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3708.65,10.0 +2019-01-15 
14:00:00,25.0,1.0,1.0,25.0,40.0,3689.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3689.45,10.0 +2019-01-15 15:00:00,25.0,1.0,1.0,25.0,40.0,3667.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3667.9,10.0 +2019-01-15 16:00:00,25.0,1.0,1.0,25.0,40.0,3667.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3667.75,10.0 +2019-01-15 17:00:00,25.0,1.0,1.0,25.0,40.0,3769.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3769.5,10.0 +2019-01-15 18:00:00,25.0,1.0,1.0,25.0,40.0,3737.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3737.35,10.0 +2019-01-15 19:00:00,25.0,1.0,1.0,25.0,40.0,3652.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3652.3,10.0 +2019-01-15 20:00:00,25.0,1.0,1.0,25.0,40.0,3463.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3463.9,10.0 +2019-01-15 21:00:00,25.0,1.0,1.0,25.0,40.0,3288.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3288.4,10.0 +2019-01-15 22:00:00,25.0,1.0,1.0,25.0,40.0,3138.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3138.75,10.0 +2019-01-15 23:00:00,25.0,1.0,1.0,25.0,40.0,2929.25,2.0,1.0,1.0,1.0,45.05,20.0,2929.25,10.0 +2019-01-16 00:00:00,25.0,1.0,1.0,25.0,40.0,2758.1,2.0,1.0,1.0,1.0,45.05,20.0,2758.1,10.0 +2019-01-16 01:00:00,25.0,1.0,1.0,25.0,40.0,2650.55,2.0,1.0,1.0,1.0,45.05,20.0,2650.55,10.0 +2019-01-16 02:00:00,25.0,1.0,1.0,25.0,40.0,2596.6,2.0,1.0,1.0,1.0,45.05,20.0,2596.6,10.0 +2019-01-16 03:00:00,25.0,1.0,1.0,25.0,40.0,2604.8500000000004,2.0,1.0,1.0,1.0,45.05,20.0,2604.8500000000004,10.0 +2019-01-16 04:00:00,25.0,1.0,1.0,25.0,40.0,2680.2,2.0,1.0,1.0,1.0,45.05,20.0,2680.2,10.0 +2019-01-16 05:00:00,25.0,1.0,1.0,25.0,40.0,2839.9,2.0,1.0,1.0,1.0,45.05,20.0,2839.9,10.0 +2019-01-16 06:00:00,25.0,1.0,1.0,25.0,40.0,3240.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3240.25,10.0 +2019-01-16 07:00:00,25.0,1.0,1.0,25.0,40.0,3586.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3586.95,10.0 +2019-01-16 08:00:00,25.0,1.0,1.0,25.0,40.0,3694.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3694.8,10.0 +2019-01-16 09:00:00,25.0,1.0,1.0,25.0,40.0,3700.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3700.25,10.0 +2019-01-16 10:00:00,25.0,1.0,1.0,25.0,40.0,3742.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3742.0,10.0 +2019-01-16 11:00:00,25.0,1.0,1.0,25.0,40.0,3774.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3774.15,10.0 +2019-01-16 12:00:00,25.0,1.0,1.0,25.0,40.0,3745.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3745.85,10.0 +2019-01-16 13:00:00,25.0,1.0,1.0,25.0,40.0,3737.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3737.8,10.0 +2019-01-16 14:00:00,25.0,1.0,1.0,25.0,40.0,3665.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3665.45,10.0 +2019-01-16 15:00:00,25.0,1.0,1.0,25.0,40.0,3641.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3641.9,10.0 +2019-01-16 16:00:00,25.0,1.0,1.0,25.0,40.0,3641.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3641.2,10.0 +2019-01-16 17:00:00,25.0,1.0,1.0,25.0,40.0,3782.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3782.85,10.0 +2019-01-16 18:00:00,25.0,1.0,1.0,25.0,40.0,3759.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3759.9,10.0 +2019-01-16 19:00:00,25.0,1.0,1.0,25.0,40.0,3677.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3677.9,10.0 +2019-01-16 20:00:00,25.0,1.0,1.0,25.0,40.0,3464.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3464.55,10.0 +2019-01-16 21:00:00,25.0,1.0,1.0,25.0,40.0,3290.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3290.15,10.0 +2019-01-16 22:00:00,25.0,1.0,1.0,25.0,40.0,3143.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3143.1,10.0 +2019-01-16 23:00:00,25.0,1.0,1.0,25.0,40.0,2955.25,2.0,1.0,1.0,1.0,45.05,20.0,2955.25,10.0 +2019-01-17 00:00:00,25.0,1.0,1.0,25.0,40.0,2806.4,2.0,1.0,1.0,1.0,45.05,20.0,2806.4,10.0 +2019-01-17 
01:00:00,25.0,1.0,1.0,25.0,40.0,2709.8,2.0,1.0,1.0,1.0,45.05,20.0,2709.8,10.0 +2019-01-17 02:00:00,25.0,1.0,1.0,25.0,40.0,2648.45,2.0,1.0,1.0,1.0,45.05,20.0,2648.45,10.0 +2019-01-17 03:00:00,25.0,1.0,1.0,25.0,40.0,2648.75,2.0,1.0,1.0,1.0,45.05,20.0,2648.75,10.0 +2019-01-17 04:00:00,25.0,1.0,1.0,25.0,40.0,2731.2,2.0,1.0,1.0,1.0,45.05,20.0,2731.2,10.0 +2019-01-17 05:00:00,25.0,1.0,1.0,25.0,40.0,2890.55,2.0,1.0,1.0,1.0,45.05,20.0,2890.55,10.0 +2019-01-17 06:00:00,25.0,1.0,1.0,25.0,40.0,3265.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3265.5,10.0 +2019-01-17 07:00:00,25.0,1.0,1.0,25.0,40.0,3611.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3611.0,10.0 +2019-01-17 08:00:00,25.0,1.0,1.0,25.0,40.0,3729.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3729.15,10.0 +2019-01-17 09:00:00,25.0,1.0,1.0,25.0,40.0,3723.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3723.15,10.0 +2019-01-17 10:00:00,25.0,1.0,1.0,25.0,40.0,3752.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3752.65,10.0 +2019-01-17 11:00:00,25.0,1.0,1.0,25.0,40.0,3801.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3801.5,10.0 +2019-01-17 12:00:00,25.0,1.0,1.0,25.0,40.0,3798.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3798.05,10.0 +2019-01-17 13:00:00,25.0,1.0,1.0,25.0,40.0,3778.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3778.05,10.0 +2019-01-17 14:00:00,25.0,1.0,1.0,25.0,40.0,3732.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3732.3,10.0 +2019-01-17 15:00:00,25.0,1.0,1.0,25.0,40.0,3685.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3685.15,10.0 +2019-01-17 16:00:00,25.0,1.0,1.0,25.0,40.0,3670.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3670.45,10.0 +2019-01-17 17:00:00,25.0,1.0,1.0,25.0,40.0,3807.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3807.35,10.0 +2019-01-17 18:00:00,25.0,1.0,1.0,25.0,40.0,3804.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3804.0,10.0 +2019-01-17 19:00:00,25.0,1.0,1.0,25.0,40.0,3701.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3701.25,10.0 +2019-01-17 20:00:00,25.0,1.0,1.0,25.0,40.0,3522.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3522.15,10.0 +2019-01-17 21:00:00,25.0,1.0,1.0,25.0,40.0,3326.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3326.2,10.0 +2019-01-17 22:00:00,25.0,1.0,1.0,25.0,40.0,3166.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3166.9,10.0 +2019-01-17 23:00:00,25.0,1.0,1.0,25.0,40.0,2949.15,2.0,1.0,1.0,1.0,45.05,20.0,2949.15,10.0 +2019-01-18 00:00:00,25.0,1.0,1.0,25.0,40.0,2822.65,2.0,1.0,1.0,1.0,45.05,20.0,2822.65,10.0 +2019-01-18 01:00:00,25.0,1.0,1.0,25.0,40.0,2705.15,2.0,1.0,1.0,1.0,45.05,20.0,2705.15,10.0 +2019-01-18 02:00:00,25.0,1.0,1.0,25.0,40.0,2636.85,2.0,1.0,1.0,1.0,45.05,20.0,2636.85,10.0 +2019-01-18 03:00:00,25.0,1.0,1.0,25.0,40.0,2651.85,2.0,1.0,1.0,1.0,45.05,20.0,2651.85,10.0 +2019-01-18 04:00:00,25.0,1.0,1.0,25.0,40.0,2715.4,2.0,1.0,1.0,1.0,45.05,20.0,2715.4,10.0 +2019-01-18 05:00:00,25.0,1.0,1.0,25.0,40.0,2850.15,2.0,1.0,1.0,1.0,45.05,20.0,2850.15,10.0 +2019-01-18 06:00:00,25.0,1.0,1.0,25.0,40.0,3207.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3207.95,10.0 +2019-01-18 07:00:00,25.0,1.0,1.0,25.0,40.0,3519.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3519.1,10.0 +2019-01-18 08:00:00,25.0,1.0,1.0,25.0,40.0,3623.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3623.05,10.0 +2019-01-18 09:00:00,25.0,1.0,1.0,25.0,40.0,3612.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3612.6,10.0 +2019-01-18 10:00:00,25.0,1.0,1.0,25.0,40.0,3608.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3608.0,10.0 +2019-01-18 11:00:00,25.0,1.0,1.0,25.0,40.0,3625.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3625.5,10.0 +2019-01-18 
12:00:00,25.0,1.0,1.0,25.0,40.0,3583.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3583.0,10.0 +2019-01-18 13:00:00,25.0,1.0,1.0,25.0,40.0,3514.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3514.85,10.0 +2019-01-18 14:00:00,25.0,1.0,1.0,25.0,40.0,3457.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3457.15,10.0 +2019-01-18 15:00:00,25.0,1.0,1.0,25.0,40.0,3418.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3418.0,10.0 +2019-01-18 16:00:00,25.0,1.0,1.0,25.0,40.0,3433.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3433.5,10.0 +2019-01-18 17:00:00,25.0,1.0,1.0,25.0,40.0,3599.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3599.75,10.0 +2019-01-18 18:00:00,25.0,1.0,1.0,25.0,40.0,3587.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3587.15,10.0 +2019-01-18 19:00:00,25.0,1.0,1.0,25.0,40.0,3489.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3489.6,10.0 +2019-01-18 20:00:00,25.0,1.0,1.0,25.0,40.0,3290.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3290.1,10.0 +2019-01-18 21:00:00,25.0,1.0,1.0,25.0,40.0,3139.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3139.35,10.0 +2019-01-18 22:00:00,25.0,1.0,1.0,25.0,40.0,3022.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3022.8,10.0 +2019-01-18 23:00:00,25.0,1.0,1.0,25.0,40.0,2887.4,2.0,1.0,1.0,1.0,45.05,20.0,2887.4,10.0 +2019-01-19 00:00:00,25.0,1.0,1.0,25.0,40.0,2757.35,2.0,1.0,1.0,1.0,45.05,20.0,2757.35,10.0 +2019-01-19 01:00:00,25.0,1.0,1.0,25.0,40.0,2649.3,2.0,1.0,1.0,1.0,45.05,20.0,2649.3,10.0 +2019-01-19 02:00:00,25.0,1.0,1.0,25.0,40.0,2575.25,2.0,1.0,1.0,1.0,45.05,20.0,2575.25,10.0 +2019-01-19 03:00:00,25.0,1.0,1.0,25.0,40.0,2542.55,2.0,1.0,1.0,1.0,45.05,20.0,2542.55,10.0 +2019-01-19 04:00:00,25.0,1.0,1.0,25.0,40.0,2529.4,2.0,1.0,1.0,1.0,45.05,20.0,2529.4,10.0 +2019-01-19 05:00:00,25.0,1.0,1.0,25.0,40.0,2521.0,2.0,1.0,1.0,1.0,45.05,20.0,2521.0,10.0 +2019-01-19 06:00:00,25.0,1.0,1.0,25.0,40.0,2554.15,2.0,1.0,1.0,1.0,45.05,20.0,2554.15,10.0 +2019-01-19 07:00:00,25.0,1.0,1.0,25.0,40.0,2689.6,2.0,1.0,1.0,1.0,45.05,20.0,2689.6,10.0 +2019-01-19 08:00:00,25.0,1.0,1.0,25.0,40.0,2868.35,2.0,1.0,1.0,1.0,45.05,20.0,2868.35,10.0 +2019-01-19 09:00:00,25.0,1.0,1.0,25.0,40.0,3016.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3016.45,10.0 +2019-01-19 10:00:00,25.0,1.0,1.0,25.0,40.0,3093.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3093.5,10.0 +2019-01-19 11:00:00,25.0,1.0,1.0,25.0,40.0,3114.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3114.6,10.0 +2019-01-19 12:00:00,25.0,1.0,1.0,25.0,40.0,3061.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3061.4,10.0 +2019-01-19 13:00:00,25.0,1.0,1.0,25.0,40.0,2959.45,2.0,1.0,1.0,1.0,45.05,20.0,2959.45,10.0 +2019-01-19 14:00:00,25.0,1.0,1.0,25.0,40.0,2884.05,2.0,1.0,1.0,1.0,45.05,20.0,2884.05,10.0 +2019-01-19 15:00:00,25.0,1.0,1.0,25.0,40.0,2874.35,2.0,1.0,1.0,1.0,45.05,20.0,2874.35,10.0 +2019-01-19 16:00:00,25.0,1.0,1.0,25.0,40.0,2932.25,2.0,1.0,1.0,1.0,45.05,20.0,2932.25,10.0 +2019-01-19 17:00:00,25.0,1.0,1.0,25.0,40.0,3144.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3144.0,10.0 +2019-01-19 18:00:00,25.0,1.0,1.0,25.0,40.0,3175.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3175.8,10.0 +2019-01-19 19:00:00,25.0,1.0,1.0,25.0,40.0,3086.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3086.2,10.0 +2019-01-19 20:00:00,25.0,1.0,1.0,25.0,40.0,2903.3,2.0,1.0,1.0,1.0,45.05,20.0,2903.3,10.0 +2019-01-19 21:00:00,25.0,1.0,1.0,25.0,40.0,2790.05,2.0,1.0,1.0,1.0,45.05,20.0,2790.05,10.0 +2019-01-19 22:00:00,25.0,1.0,1.0,25.0,40.0,2756.0,2.0,1.0,1.0,1.0,45.05,20.0,2756.0,10.0 +2019-01-19 23:00:00,25.0,1.0,1.0,25.0,40.0,2652.8,2.0,1.0,1.0,1.0,45.05,20.0,2652.8,10.0 +2019-01-20 
00:00:00,25.0,1.0,1.0,25.0,40.0,2531.9,2.0,1.0,1.0,1.0,45.05,20.0,2531.9,10.0 +2019-01-20 01:00:00,25.0,1.0,1.0,25.0,40.0,2435.45,2.0,1.0,1.0,1.0,45.05,20.0,2435.45,10.0 +2019-01-20 02:00:00,25.0,1.0,1.0,25.0,40.0,2381.0,2.0,1.0,1.0,1.0,45.05,20.0,2381.0,10.0 +2019-01-20 03:00:00,25.0,1.0,1.0,25.0,40.0,2352.6,2.0,1.0,1.0,1.0,45.05,20.0,2352.6,10.0 +2019-01-20 04:00:00,25.0,1.0,1.0,25.0,40.0,2333.05,2.0,1.0,1.0,1.0,45.05,20.0,2333.05,10.0 +2019-01-20 05:00:00,25.0,1.0,1.0,25.0,40.0,2310.95,2.0,1.0,1.0,1.0,45.05,20.0,2310.95,10.0 +2019-01-20 06:00:00,25.0,1.0,1.0,25.0,40.0,2278.05,2.0,1.0,1.0,1.0,45.05,20.0,2278.05,10.0 +2019-01-20 07:00:00,25.0,1.0,1.0,25.0,40.0,2363.3,2.0,1.0,1.0,1.0,45.05,20.0,2363.3,10.0 +2019-01-20 08:00:00,25.0,1.0,1.0,25.0,40.0,2525.85,2.0,1.0,1.0,1.0,45.05,20.0,2525.85,10.0 +2019-01-20 09:00:00,25.0,1.0,1.0,25.0,40.0,2694.95,2.0,1.0,1.0,1.0,45.05,20.0,2694.95,10.0 +2019-01-20 10:00:00,25.0,1.0,1.0,25.0,40.0,2787.25,2.0,1.0,1.0,1.0,45.05,20.0,2787.25,10.0 +2019-01-20 11:00:00,25.0,1.0,1.0,25.0,40.0,2871.95,2.0,1.0,1.0,1.0,45.05,20.0,2871.95,10.0 +2019-01-20 12:00:00,25.0,1.0,1.0,25.0,40.0,2860.55,2.0,1.0,1.0,1.0,45.05,20.0,2860.55,10.0 +2019-01-20 13:00:00,25.0,1.0,1.0,25.0,40.0,2778.4,2.0,1.0,1.0,1.0,45.05,20.0,2778.4,10.0 +2019-01-20 14:00:00,25.0,1.0,1.0,25.0,40.0,2703.95,2.0,1.0,1.0,1.0,45.05,20.0,2703.95,10.0 +2019-01-20 15:00:00,25.0,1.0,1.0,25.0,40.0,2688.55,2.0,1.0,1.0,1.0,45.05,20.0,2688.55,10.0 +2019-01-20 16:00:00,25.0,1.0,1.0,25.0,40.0,2752.85,2.0,1.0,1.0,1.0,45.05,20.0,2752.85,10.0 +2019-01-20 17:00:00,25.0,1.0,1.0,25.0,40.0,2974.75,2.0,1.0,1.0,1.0,45.05,20.0,2974.75,10.0 +2019-01-20 18:00:00,25.0,1.0,1.0,25.0,40.0,3070.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3070.1,10.0 +2019-01-20 19:00:00,25.0,1.0,1.0,25.0,40.0,3011.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3011.55,10.0 +2019-01-20 20:00:00,25.0,1.0,1.0,25.0,40.0,2889.25,2.0,1.0,1.0,1.0,45.05,20.0,2889.25,10.0 +2019-01-20 21:00:00,25.0,1.0,1.0,25.0,40.0,2817.3,2.0,1.0,1.0,1.0,45.05,20.0,2817.3,10.0 +2019-01-20 22:00:00,25.0,1.0,1.0,25.0,40.0,2839.0,2.0,1.0,1.0,1.0,45.05,20.0,2839.0,10.0 +2019-01-20 23:00:00,25.0,1.0,1.0,25.0,40.0,2735.75,2.0,1.0,1.0,1.0,45.05,20.0,2735.75,10.0 +2019-01-21 00:00:00,25.0,1.0,1.0,25.0,40.0,2651.5,2.0,1.0,1.0,1.0,45.05,20.0,2651.5,10.0 +2019-01-21 01:00:00,25.0,1.0,1.0,25.0,40.0,2573.35,2.0,1.0,1.0,1.0,45.05,20.0,2573.35,10.0 +2019-01-21 02:00:00,25.0,1.0,1.0,25.0,40.0,2541.4,2.0,1.0,1.0,1.0,45.05,20.0,2541.4,10.0 +2019-01-21 03:00:00,25.0,1.0,1.0,25.0,40.0,2557.4,2.0,1.0,1.0,1.0,45.05,20.0,2557.4,10.0 +2019-01-21 04:00:00,25.0,1.0,1.0,25.0,40.0,2640.5,2.0,1.0,1.0,1.0,45.05,20.0,2640.5,10.0 +2019-01-21 05:00:00,25.0,1.0,1.0,25.0,40.0,2841.0,2.0,1.0,1.0,1.0,45.05,20.0,2841.0,10.0 +2019-01-21 06:00:00,25.0,1.0,1.0,25.0,40.0,3255.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3255.4,10.0 +2019-01-21 07:00:00,25.0,1.0,1.0,25.0,40.0,3591.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3591.05,10.0 +2019-01-21 08:00:00,25.0,1.0,1.0,25.0,40.0,3684.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3684.05,10.0 +2019-01-21 09:00:00,25.0,1.0,1.0,25.0,40.0,3685.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3685.5,10.0 +2019-01-21 10:00:00,25.0,1.0,1.0,25.0,40.0,3698.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3698.45,10.0 +2019-01-21 11:00:00,25.0,1.0,1.0,25.0,40.0,3725.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3725.05,10.0 +2019-01-21 12:00:00,25.0,1.0,1.0,25.0,40.0,3689.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3689.65,10.0 +2019-01-21 
13:00:00,25.0,1.0,1.0,25.0,40.0,3664.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3664.7,10.0 +2019-01-21 14:00:00,25.0,1.0,1.0,25.0,40.0,3609.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3609.65,10.0 +2019-01-21 15:00:00,25.0,1.0,1.0,25.0,40.0,3579.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3579.65,10.0 +2019-01-21 16:00:00,25.0,1.0,1.0,25.0,40.0,3582.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3582.25,10.0 +2019-01-21 17:00:00,25.0,1.0,1.0,25.0,40.0,3765.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3765.7,10.0 +2019-01-21 18:00:00,25.0,1.0,1.0,25.0,40.0,3779.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3779.65,10.0 +2019-01-21 19:00:00,25.0,1.0,1.0,25.0,40.0,3695.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3695.25,10.0 +2019-01-21 20:00:00,25.0,1.0,1.0,25.0,40.0,3514.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3514.15,10.0 +2019-01-21 21:00:00,25.0,1.0,1.0,25.0,40.0,3348.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3348.75,10.0 +2019-01-21 22:00:00,25.0,1.0,1.0,25.0,40.0,3228.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3228.05,10.0 +2019-01-21 23:00:00,25.0,1.0,1.0,25.0,40.0,3029.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3029.15,10.0 +2019-01-22 00:00:00,25.0,1.0,1.0,25.0,40.0,2894.4,2.0,1.0,1.0,1.0,45.05,20.0,2894.4,10.0 +2019-01-22 01:00:00,25.0,1.0,1.0,25.0,40.0,2817.35,2.0,1.0,1.0,1.0,45.05,20.0,2817.35,10.0 +2019-01-22 02:00:00,25.0,1.0,1.0,25.0,40.0,2764.95,2.0,1.0,1.0,1.0,45.05,20.0,2764.95,10.0 +2019-01-22 03:00:00,25.0,1.0,1.0,25.0,40.0,2756.75,2.0,1.0,1.0,1.0,45.05,20.0,2756.75,10.0 +2019-01-22 04:00:00,25.0,1.0,1.0,25.0,40.0,2814.4,2.0,1.0,1.0,1.0,45.05,20.0,2814.4,10.0 +2019-01-22 05:00:00,25.0,1.0,1.0,25.0,40.0,2973.65,2.0,1.0,1.0,1.0,45.05,20.0,2973.65,10.0 +2019-01-22 06:00:00,25.0,1.0,1.0,25.0,40.0,3339.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3339.0,10.0 +2019-01-22 07:00:00,25.0,1.0,1.0,25.0,40.0,3662.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3662.55,10.0 +2019-01-22 08:00:00,25.0,1.0,1.0,25.0,40.0,3775.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3775.65,10.0 +2019-01-22 09:00:00,25.0,1.0,1.0,25.0,40.0,3790.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3790.05,10.0 +2019-01-22 10:00:00,25.0,1.0,1.0,25.0,40.0,3807.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3807.5,10.0 +2019-01-22 11:00:00,25.0,1.0,1.0,25.0,40.0,3826.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3826.9,10.0 +2019-01-22 12:00:00,25.0,1.0,1.0,25.0,40.0,3800.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3800.2,10.0 +2019-01-22 13:00:00,25.0,1.0,1.0,25.0,40.0,3773.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3773.0,10.0 +2019-01-22 14:00:00,25.0,1.0,1.0,25.0,40.0,3742.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3742.55,10.0 +2019-01-22 15:00:00,25.0,1.0,1.0,25.0,40.0,3710.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3710.55,10.0 +2019-01-22 16:00:00,25.0,1.0,1.0,25.0,40.0,3702.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3702.75,10.0 +2019-01-22 17:00:00,25.0,1.0,1.0,25.0,40.0,3840.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3840.3,10.0 +2019-01-22 18:00:00,25.0,1.0,1.0,25.0,40.0,3833.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3833.25,10.0 +2019-01-22 19:00:00,25.0,1.0,1.0,25.0,40.0,3754.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3754.35,10.0 +2019-01-22 20:00:00,25.0,1.0,1.0,25.0,40.0,3555.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3555.65,10.0 +2019-01-22 21:00:00,25.0,1.0,1.0,25.0,40.0,3375.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3375.5,10.0 +2019-01-22 22:00:00,25.0,1.0,1.0,25.0,40.0,3240.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3240.9,10.0 +2019-01-22 23:00:00,25.0,1.0,1.0,25.0,40.0,3056.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3056.35,10.0 +2019-01-23 
00:00:00,25.0,1.0,1.0,25.0,40.0,2936.65,2.0,1.0,1.0,1.0,45.05,20.0,2936.65,10.0 +2019-01-23 01:00:00,25.0,1.0,1.0,25.0,40.0,2835.6,2.0,1.0,1.0,1.0,45.05,20.0,2835.6,10.0 +2019-01-23 02:00:00,25.0,1.0,1.0,25.0,40.0,2796.5,2.0,1.0,1.0,1.0,45.05,20.0,2796.5,10.0 +2019-01-23 03:00:00,25.0,1.0,1.0,25.0,40.0,2798.4,2.0,1.0,1.0,1.0,45.05,20.0,2798.4,10.0 +2019-01-23 04:00:00,25.0,1.0,1.0,25.0,40.0,2838.35,2.0,1.0,1.0,1.0,45.05,20.0,2838.35,10.0 +2019-01-23 05:00:00,25.0,1.0,1.0,25.0,40.0,2987.45,2.0,1.0,1.0,1.0,45.05,20.0,2987.45,10.0 +2019-01-23 06:00:00,25.0,1.0,1.0,25.0,40.0,3332.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3332.9,10.0 +2019-01-23 07:00:00,25.0,1.0,1.0,25.0,40.0,3663.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3663.9,10.0 +2019-01-23 08:00:00,25.0,1.0,1.0,25.0,40.0,3757.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3757.9,10.0 +2019-01-23 09:00:00,25.0,1.0,1.0,25.0,40.0,3771.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3771.7,10.0 +2019-01-23 10:00:00,25.0,1.0,1.0,25.0,40.0,3787.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3787.35,10.0 +2019-01-23 11:00:00,25.0,1.0,1.0,25.0,40.0,3797.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3797.95,10.0 +2019-01-23 12:00:00,25.0,1.0,1.0,25.0,40.0,3773.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3773.2,10.0 +2019-01-23 13:00:00,25.0,1.0,1.0,25.0,40.0,3749.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3749.0,10.0 +2019-01-23 14:00:00,25.0,1.0,1.0,25.0,40.0,3704.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3704.5,10.0 +2019-01-23 15:00:00,25.0,1.0,1.0,25.0,40.0,3666.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3666.35,10.0 +2019-01-23 16:00:00,25.0,1.0,1.0,25.0,40.0,3661.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3661.8,10.0 +2019-01-23 17:00:00,25.0,1.0,1.0,25.0,40.0,3796.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3796.5,10.0 +2019-01-23 18:00:00,25.0,1.0,1.0,25.0,40.0,3801.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3801.5,10.0 +2019-01-23 19:00:00,25.0,1.0,1.0,25.0,40.0,3716.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3716.25,10.0 +2019-01-23 20:00:00,25.0,1.0,1.0,25.0,40.0,3525.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3525.65,10.0 +2019-01-23 21:00:00,25.0,1.0,1.0,25.0,40.0,3358.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3358.3,10.0 +2019-01-23 22:00:00,25.0,1.0,1.0,25.0,40.0,3244.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3244.15,10.0 +2019-01-23 23:00:00,25.0,1.0,1.0,25.0,40.0,3046.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3046.5,10.0 +2019-01-24 00:00:00,25.0,1.0,1.0,25.0,40.0,2935.3,2.0,1.0,1.0,1.0,45.05,20.0,2935.3,10.0 +2019-01-24 01:00:00,25.0,1.0,1.0,25.0,40.0,2834.55,2.0,1.0,1.0,1.0,45.05,20.0,2834.55,10.0 +2019-01-24 02:00:00,25.0,1.0,1.0,25.0,40.0,2751.2,2.0,1.0,1.0,1.0,45.05,20.0,2751.2,10.0 +2019-01-24 03:00:00,25.0,1.0,1.0,25.0,40.0,2748.6,2.0,1.0,1.0,1.0,45.05,20.0,2748.6,10.0 +2019-01-24 04:00:00,25.0,1.0,1.0,25.0,40.0,2780.95,2.0,1.0,1.0,1.0,45.05,20.0,2780.95,10.0 +2019-01-24 05:00:00,25.0,1.0,1.0,25.0,40.0,2926.25,2.0,1.0,1.0,1.0,45.05,20.0,2926.25,10.0 +2019-01-24 06:00:00,25.0,1.0,1.0,25.0,40.0,3254.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3254.55,10.0 +2019-01-24 07:00:00,25.0,1.0,1.0,25.0,40.0,3570.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3570.3,10.0 +2019-01-24 08:00:00,25.0,1.0,1.0,25.0,40.0,3695.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3695.75,10.0 +2019-01-24 09:00:00,25.0,1.0,1.0,25.0,40.0,3721.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3721.25,10.0 +2019-01-24 10:00:00,25.0,1.0,1.0,25.0,40.0,3761.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3761.8,10.0 +2019-01-24 11:00:00,25.0,1.0,1.0,25.0,40.0,3783.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3783.55,10.0 
+2019-01-24 12:00:00,25.0,1.0,1.0,25.0,40.0,3758.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3758.9,10.0 +2019-01-24 13:00:00,25.0,1.0,1.0,25.0,40.0,3741.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3741.5,10.0 +2019-01-24 14:00:00,25.0,1.0,1.0,25.0,40.0,3699.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3699.55,10.0 +2019-01-24 15:00:00,25.0,1.0,1.0,25.0,40.0,3693.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3693.6,10.0 +2019-01-24 16:00:00,25.0,1.0,1.0,25.0,40.0,3674.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3674.7,10.0 +2019-01-24 17:00:00,25.0,1.0,1.0,25.0,40.0,3785.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3785.1,10.0 +2019-01-24 18:00:00,25.0,1.0,1.0,25.0,40.0,3765.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3765.85,10.0 +2019-01-24 19:00:00,25.0,1.0,1.0,25.0,40.0,3681.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3681.55,10.0 +2019-01-24 20:00:00,25.0,1.0,1.0,25.0,40.0,3485.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3485.7,10.0 +2019-01-24 21:00:00,25.0,1.0,1.0,25.0,40.0,3319.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3319.35,10.0 +2019-01-24 22:00:00,25.0,1.0,1.0,25.0,40.0,3188.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3188.45,10.0 +2019-01-24 23:00:00,25.0,1.0,1.0,25.0,40.0,3007.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3007.1,10.0 +2019-01-25 00:00:00,25.0,1.0,1.0,25.0,40.0,2894.8,2.0,1.0,1.0,1.0,45.05,20.0,2894.8,10.0 +2019-01-25 01:00:00,25.0,1.0,1.0,25.0,40.0,2806.55,2.0,1.0,1.0,1.0,45.05,20.0,2806.55,10.0 +2019-01-25 02:00:00,25.0,1.0,1.0,25.0,40.0,2763.65,2.0,1.0,1.0,1.0,45.05,20.0,2763.65,10.0 +2019-01-25 03:00:00,25.0,1.0,1.0,25.0,40.0,2753.75,2.0,1.0,1.0,1.0,45.05,20.0,2753.75,10.0 +2019-01-25 04:00:00,25.0,1.0,1.0,25.0,40.0,2809.55,2.0,1.0,1.0,1.0,45.05,20.0,2809.55,10.0 +2019-01-25 05:00:00,25.0,1.0,1.0,25.0,40.0,2960.8,2.0,1.0,1.0,1.0,45.05,20.0,2960.8,10.0 +2019-01-25 06:00:00,25.0,1.0,1.0,25.0,40.0,3294.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3294.9,10.0 +2019-01-25 07:00:00,25.0,1.0,1.0,25.0,40.0,3606.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3606.15,10.0 +2019-01-25 08:00:00,25.0,1.0,1.0,25.0,40.0,3721.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3721.95,10.0 +2019-01-25 09:00:00,25.0,1.0,1.0,25.0,40.0,3751.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3751.5,10.0 +2019-01-25 10:00:00,25.0,1.0,1.0,25.0,40.0,3770.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3770.15,10.0 +2019-01-25 11:00:00,25.0,1.0,1.0,25.0,40.0,3778.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3778.15,10.0 +2019-01-25 12:00:00,25.0,1.0,1.0,25.0,40.0,3739.35,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3739.35,10.0 +2019-01-25 13:00:00,25.0,1.0,1.0,25.0,40.0,3668.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3668.5,10.0 +2019-01-25 14:00:00,25.0,1.0,1.0,25.0,40.0,3612.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3612.75,10.0 +2019-01-25 15:00:00,25.0,1.0,1.0,25.0,40.0,3570.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3570.25,10.0 +2019-01-25 16:00:00,25.0,1.0,1.0,25.0,40.0,3578.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3578.05,10.0 +2019-01-25 17:00:00,25.0,1.0,1.0,25.0,40.0,3717.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3717.75,10.0 +2019-01-25 18:00:00,25.0,1.0,1.0,25.0,40.0,3722.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3722.75,10.0 +2019-01-25 19:00:00,25.0,1.0,1.0,25.0,40.0,3642.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3642.05,10.0 +2019-01-25 20:00:00,25.0,1.0,1.0,25.0,40.0,3445.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3445.55,10.0 +2019-01-25 21:00:00,25.0,1.0,1.0,25.0,40.0,3303.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3303.65,10.0 +2019-01-25 22:00:00,25.0,1.0,1.0,25.0,40.0,3208.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3208.5,10.0 +2019-01-25 
23:00:00,25.0,1.0,1.0,25.0,40.0,3034.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3034.9,10.0 +2019-01-26 00:00:00,25.0,1.0,1.0,25.0,40.0,2898.9,2.0,1.0,1.0,1.0,45.05,20.0,2898.9,10.0 +2019-01-26 01:00:00,25.0,1.0,1.0,25.0,40.0,2769.3,2.0,1.0,1.0,1.0,45.05,20.0,2769.3,10.0 +2019-01-26 02:00:00,25.0,1.0,1.0,25.0,40.0,2691.6,2.0,1.0,1.0,1.0,45.05,20.0,2691.6,10.0 +2019-01-26 03:00:00,25.0,1.0,1.0,25.0,40.0,2647.4,2.0,1.0,1.0,1.0,45.05,20.0,2647.4,10.0 +2019-01-26 04:00:00,25.0,1.0,1.0,25.0,40.0,2631.5,2.0,1.0,1.0,1.0,45.05,20.0,2631.5,10.0 +2019-01-26 05:00:00,25.0,1.0,1.0,25.0,40.0,2618.6,2.0,1.0,1.0,1.0,45.05,20.0,2618.6,10.0 +2019-01-26 06:00:00,25.0,1.0,1.0,25.0,40.0,2655.6,2.0,1.0,1.0,1.0,45.05,20.0,2655.6,10.0 +2019-01-26 07:00:00,25.0,1.0,1.0,25.0,40.0,2806.95,2.0,1.0,1.0,1.0,45.05,20.0,2806.95,10.0 +2019-01-26 08:00:00,25.0,1.0,1.0,25.0,40.0,2992.7,2.0,1.0,1.0,1.0,45.05,20.0,2992.7,10.0 +2019-01-26 09:00:00,25.0,1.0,1.0,25.0,40.0,3145.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3145.4,10.0 +2019-01-26 10:00:00,25.0,1.0,1.0,25.0,40.0,3240.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3240.45,10.0 +2019-01-26 11:00:00,25.0,1.0,1.0,25.0,40.0,3273.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3273.6,10.0 +2019-01-26 12:00:00,25.0,1.0,1.0,25.0,40.0,3245.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3245.1,10.0 +2019-01-26 13:00:00,25.0,1.0,1.0,25.0,40.0,3144.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3144.1,10.0 +2019-01-26 14:00:00,25.0,1.0,1.0,25.0,40.0,3073.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3073.8,10.0 +2019-01-26 15:00:00,25.0,1.0,1.0,25.0,40.0,3041.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3041.6,10.0 +2019-01-26 16:00:00,25.0,1.0,1.0,25.0,40.0,3050.1000000000004,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3050.1000000000004,10.0 +2019-01-26 17:00:00,25.0,1.0,1.0,25.0,40.0,3205.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3205.0,10.0 +2019-01-26 18:00:00,25.0,1.0,1.0,25.0,40.0,3238.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3238.7,10.0 +2019-01-26 19:00:00,25.0,1.0,1.0,25.0,40.0,3131.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3131.4,10.0 +2019-01-26 20:00:00,25.0,1.0,1.0,25.0,40.0,2947.2,2.0,1.0,1.0,1.0,45.05,20.0,2947.2,10.0 +2019-01-26 21:00:00,25.0,1.0,1.0,25.0,40.0,2853.8,2.0,1.0,1.0,1.0,45.05,20.0,2853.8,10.0 +2019-01-26 22:00:00,25.0,1.0,1.0,25.0,40.0,2811.6,2.0,1.0,1.0,1.0,45.05,20.0,2811.6,10.0 +2019-01-26 23:00:00,25.0,1.0,1.0,25.0,40.0,2658.4,2.0,1.0,1.0,1.0,45.05,20.0,2658.4,10.0 +2019-01-27 00:00:00,25.0,1.0,1.0,25.0,40.0,2515.4,2.0,1.0,1.0,1.0,45.05,20.0,2515.4,10.0 +2019-01-27 01:00:00,25.0,1.0,1.0,25.0,40.0,2415.35,2.0,1.0,1.0,1.0,45.05,20.0,2415.35,10.0 +2019-01-27 02:00:00,25.0,1.0,1.0,25.0,40.0,2342.15,2.0,1.0,1.0,1.0,45.05,20.0,2342.15,10.0 +2019-01-27 03:00:00,25.0,1.0,1.0,25.0,40.0,2325.15,2.0,1.0,1.0,1.0,45.05,20.0,2325.15,10.0 +2019-01-27 04:00:00,25.0,1.0,1.0,25.0,40.0,2324.15,2.0,1.0,1.0,1.0,45.05,20.0,2324.15,10.0 +2019-01-27 05:00:00,25.0,1.0,1.0,25.0,40.0,2303.3500000000004,2.0,1.0,1.0,1.0,45.05,20.0,2303.3500000000004,10.0 +2019-01-27 06:00:00,25.0,1.0,1.0,25.0,40.0,2262.6,2.0,1.0,1.0,1.0,45.05,20.0,2262.6,10.0 +2019-01-27 07:00:00,25.0,1.0,1.0,25.0,40.0,2375.0,2.0,1.0,1.0,1.0,45.05,20.0,2375.0,10.0 +2019-01-27 08:00:00,25.0,1.0,1.0,25.0,40.0,2508.5,2.0,1.0,1.0,1.0,45.05,20.0,2508.5,10.0 +2019-01-27 09:00:00,25.0,1.0,1.0,25.0,40.0,2685.5,2.0,1.0,1.0,1.0,45.05,20.0,2685.5,10.0 +2019-01-27 10:00:00,25.0,1.0,1.0,25.0,40.0,2815.7,2.0,1.0,1.0,1.0,45.05,20.0,2815.7,10.0 +2019-01-27 11:00:00,25.0,1.0,1.0,25.0,40.0,2944.8,2.0,1.0,1.0,1.0,45.05,20.0,2944.8,10.0 +2019-01-27 
12:00:00,25.0,1.0,1.0,25.0,40.0,2934.5,2.0,1.0,1.0,1.0,45.05,20.0,2934.5,10.0 +2019-01-27 13:00:00,25.0,1.0,1.0,25.0,40.0,2812.5,2.0,1.0,1.0,1.0,45.05,20.0,2812.5,10.0 +2019-01-27 14:00:00,25.0,1.0,1.0,25.0,40.0,2738.9500000000003,2.0,1.0,1.0,1.0,45.05,20.0,2738.9500000000003,10.0 +2019-01-27 15:00:00,25.0,1.0,1.0,25.0,40.0,2702.65,2.0,1.0,1.0,1.0,45.05,20.0,2702.65,10.0 +2019-01-27 16:00:00,25.0,1.0,1.0,25.0,40.0,2739.95,2.0,1.0,1.0,1.0,45.05,20.0,2739.95,10.0 +2019-01-27 17:00:00,25.0,1.0,1.0,25.0,40.0,2941.15,2.0,1.0,1.0,1.0,45.05,20.0,2941.15,10.0 +2019-01-27 18:00:00,25.0,1.0,1.0,25.0,40.0,3023.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3023.1,10.0 +2019-01-27 19:00:00,25.0,1.0,1.0,25.0,40.0,2978.5,2.0,1.0,1.0,1.0,45.05,20.0,2978.5,10.0 +2019-01-27 20:00:00,25.0,1.0,1.0,25.0,40.0,2867.25,2.0,1.0,1.0,1.0,45.05,20.0,2867.25,10.0 +2019-01-27 21:00:00,25.0,1.0,1.0,25.0,40.0,2793.2,2.0,1.0,1.0,1.0,45.05,20.0,2793.2,10.0 +2019-01-27 22:00:00,25.0,1.0,1.0,25.0,40.0,2803.6,2.0,1.0,1.0,1.0,45.05,20.0,2803.6,10.0 +2019-01-27 23:00:00,25.0,1.0,1.0,25.0,40.0,2655.95,2.0,1.0,1.0,1.0,45.05,20.0,2655.95,10.0 +2019-01-28 00:00:00,25.0,1.0,1.0,25.0,40.0,2541.65,2.0,1.0,1.0,1.0,45.05,20.0,2541.65,10.0 +2019-01-28 01:00:00,25.0,1.0,1.0,25.0,40.0,2456.9,2.0,1.0,1.0,1.0,45.05,20.0,2456.9,10.0 +2019-01-28 02:00:00,25.0,1.0,1.0,25.0,40.0,2434.5,2.0,1.0,1.0,1.0,45.05,20.0,2434.5,10.0 +2019-01-28 03:00:00,25.0,1.0,1.0,25.0,40.0,2458.25,2.0,1.0,1.0,1.0,45.05,20.0,2458.25,10.0 +2019-01-28 04:00:00,25.0,1.0,1.0,25.0,40.0,2547.7,2.0,1.0,1.0,1.0,45.05,20.0,2547.7,10.0 +2019-01-28 05:00:00,25.0,1.0,1.0,25.0,40.0,2729.85,2.0,1.0,1.0,1.0,45.05,20.0,2729.85,10.0 +2019-01-28 06:00:00,25.0,1.0,1.0,25.0,40.0,3171.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3171.25,10.0 +2019-01-28 07:00:00,25.0,1.0,1.0,25.0,40.0,3522.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3522.25,10.0 +2019-01-28 08:00:00,25.0,1.0,1.0,25.0,40.0,3647.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3647.2,10.0 +2019-01-28 09:00:00,25.0,1.0,1.0,25.0,40.0,3695.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3695.6,10.0 +2019-01-28 10:00:00,25.0,1.0,1.0,25.0,40.0,3734.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3734.15,10.0 +2019-01-28 11:00:00,25.0,1.0,1.0,25.0,40.0,3768.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3768.0,10.0 +2019-01-28 12:00:00,25.0,1.0,1.0,25.0,40.0,3765.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3765.65,10.0 +2019-01-28 13:00:00,25.0,1.0,1.0,25.0,40.0,3752.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3752.15,10.0 +2019-01-28 14:00:00,25.0,1.0,1.0,25.0,40.0,3713.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3713.8,10.0 +2019-01-28 15:00:00,25.0,1.0,1.0,25.0,40.0,3685.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3685.2,10.0 +2019-01-28 16:00:00,25.0,1.0,1.0,25.0,40.0,3663.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3663.9,10.0 +2019-01-28 17:00:00,25.0,1.0,1.0,25.0,40.0,3766.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3766.4,10.0 +2019-01-28 18:00:00,25.0,1.0,1.0,25.0,40.0,3759.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3759.45,10.0 +2019-01-28 19:00:00,25.0,1.0,1.0,25.0,40.0,3646.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3646.3,10.0 +2019-01-28 20:00:00,25.0,1.0,1.0,25.0,40.0,3455.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3455.25,10.0 +2019-01-28 21:00:00,25.0,1.0,1.0,25.0,40.0,3284.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3284.4,10.0 +2019-01-28 22:00:00,25.0,1.0,1.0,25.0,40.0,3157.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3157.0,10.0 +2019-01-28 23:00:00,25.0,1.0,1.0,25.0,40.0,2950.95,2.0,1.0,1.0,1.0,45.05,20.0,2950.95,10.0 +2019-01-29 
00:00:00,25.0,1.0,1.0,25.0,40.0,2789.7,2.0,1.0,1.0,1.0,45.05,20.0,2789.7,10.0 +2019-01-29 01:00:00,25.0,1.0,1.0,25.0,40.0,2701.05,2.0,1.0,1.0,1.0,45.05,20.0,2701.05,10.0 +2019-01-29 02:00:00,25.0,1.0,1.0,25.0,40.0,2658.4,2.0,1.0,1.0,1.0,45.05,20.0,2658.4,10.0 +2019-01-29 03:00:00,25.0,1.0,1.0,25.0,40.0,2682.3,2.0,1.0,1.0,1.0,45.05,20.0,2682.3,10.0 +2019-01-29 04:00:00,25.0,1.0,1.0,25.0,40.0,2740.65,2.0,1.0,1.0,1.0,45.05,20.0,2740.65,10.0 +2019-01-29 05:00:00,25.0,1.0,1.0,25.0,40.0,2923.15,2.0,1.0,1.0,1.0,45.05,20.0,2923.15,10.0 +2019-01-29 06:00:00,25.0,1.0,1.0,25.0,40.0,3293.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3293.75,10.0 +2019-01-29 07:00:00,25.0,1.0,1.0,25.0,40.0,3580.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3580.75,10.0 +2019-01-29 08:00:00,25.0,1.0,1.0,25.0,40.0,3679.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3679.8,10.0 +2019-01-29 09:00:00,25.0,1.0,1.0,25.0,40.0,3693.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3693.65,10.0 +2019-01-29 10:00:00,25.0,1.0,1.0,25.0,40.0,3696.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3696.55,10.0 +2019-01-29 11:00:00,25.0,1.0,1.0,25.0,40.0,3723.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3723.6,10.0 +2019-01-29 12:00:00,25.0,1.0,1.0,25.0,40.0,3684.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3684.75,10.0 +2019-01-29 13:00:00,25.0,1.0,1.0,25.0,40.0,3655.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3655.75,10.0 +2019-01-29 14:00:00,25.0,1.0,1.0,25.0,40.0,3597.4,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3597.4,10.0 +2019-01-29 15:00:00,25.0,1.0,1.0,25.0,40.0,3559.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3559.3,10.0 +2019-01-29 16:00:00,25.0,1.0,1.0,25.0,40.0,3508.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3508.7,10.0 +2019-01-29 17:00:00,25.0,1.0,1.0,25.0,40.0,3659.3,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3659.3,10.0 +2019-01-29 18:00:00,25.0,1.0,1.0,25.0,40.0,3720.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3720.45,10.0 +2019-01-29 19:00:00,25.0,1.0,1.0,25.0,40.0,3637.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3637.25,10.0 +2019-01-29 20:00:00,25.0,1.0,1.0,25.0,40.0,3468.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3468.85,10.0 +2019-01-29 21:00:00,25.0,1.0,1.0,25.0,40.0,3284.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3284.0,10.0 +2019-01-29 22:00:00,25.0,1.0,1.0,25.0,40.0,3158.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3158.5,10.0 +2019-01-29 23:00:00,25.0,1.0,1.0,25.0,40.0,2959.9,2.0,1.0,1.0,1.0,45.05,20.0,2959.9,10.0 +2019-01-30 00:00:00,25.0,1.0,1.0,25.0,40.0,2837.05,2.0,1.0,1.0,1.0,45.05,20.0,2837.05,10.0 +2019-01-30 01:00:00,25.0,1.0,1.0,25.0,40.0,2760.1,2.0,1.0,1.0,1.0,45.05,20.0,2760.1,10.0 +2019-01-30 02:00:00,25.0,1.0,1.0,25.0,40.0,2710.6,2.0,1.0,1.0,1.0,45.05,20.0,2710.6,10.0 +2019-01-30 03:00:00,25.0,1.0,1.0,25.0,40.0,2713.25,2.0,1.0,1.0,1.0,45.05,20.0,2713.25,10.0 +2019-01-30 04:00:00,25.0,1.0,1.0,25.0,40.0,2770.75,2.0,1.0,1.0,1.0,45.05,20.0,2770.75,10.0 +2019-01-30 05:00:00,25.0,1.0,1.0,25.0,40.0,2947.9,2.0,1.0,1.0,1.0,45.05,20.0,2947.9,10.0 +2019-01-30 06:00:00,25.0,1.0,1.0,25.0,40.0,3309.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3309.5,10.0 +2019-01-30 07:00:00,25.0,1.0,1.0,25.0,40.0,3575.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3575.2,10.0 +2019-01-30 08:00:00,25.0,1.0,1.0,25.0,40.0,3642.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3642.55,10.0 +2019-01-30 09:00:00,25.0,1.0,1.0,25.0,40.0,3642.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3642.1,10.0 +2019-01-30 10:00:00,25.0,1.0,1.0,25.0,40.0,3753.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3753.8,10.0 +2019-01-30 11:00:00,25.0,1.0,1.0,25.0,40.0,3796.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3796.9,10.0 +2019-01-30 
12:00:00,25.0,1.0,1.0,25.0,40.0,3767.0,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3767.0,10.0 +2019-01-30 13:00:00,25.0,1.0,1.0,25.0,40.0,3757.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3757.8,10.0 +2019-01-30 14:00:00,25.0,1.0,1.0,25.0,40.0,3691.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3691.6,10.0 +2019-01-30 15:00:00,25.0,1.0,1.0,25.0,40.0,3638.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3638.6,10.0 +2019-01-30 16:00:00,25.0,1.0,1.0,25.0,40.0,3608.7,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3608.7,10.0 +2019-01-30 17:00:00,25.0,1.0,1.0,25.0,40.0,3733.15,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3733.15,10.0 +2019-01-30 18:00:00,25.0,1.0,1.0,25.0,40.0,3754.65,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3754.65,10.0 +2019-01-30 19:00:00,25.0,1.0,1.0,25.0,40.0,3675.1,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3675.1,10.0 +2019-01-30 20:00:00,25.0,1.0,1.0,25.0,40.0,3490.9,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3490.9,10.0 +2019-01-30 21:00:00,25.0,1.0,1.0,25.0,40.0,3313.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3313.05,10.0 +2019-01-30 22:00:00,25.0,1.0,1.0,25.0,40.0,3191.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3191.25,10.0 +2019-01-30 23:00:00,25.0,1.0,1.0,25.0,40.0,2999.8,2.0,1.0,1.0,1.0,45.05,20.0,2999.8,10.0 +2019-01-31 00:00:00,25.0,1.0,1.0,25.0,40.0,2855.3,2.0,1.0,1.0,1.0,45.05,20.0,2855.3,10.0 +2019-01-31 01:00:00,25.0,1.0,1.0,25.0,40.0,2764.9,2.0,1.0,1.0,1.0,45.05,20.0,2764.9,10.0 +2019-01-31 02:00:00,25.0,1.0,1.0,25.0,40.0,2717.45,2.0,1.0,1.0,1.0,45.05,20.0,2717.45,10.0 +2019-01-31 03:00:00,25.0,1.0,1.0,25.0,40.0,2719.7,2.0,1.0,1.0,1.0,45.05,20.0,2719.7,10.0 +2019-01-31 04:00:00,25.0,1.0,1.0,25.0,40.0,2775.65,2.0,1.0,1.0,1.0,45.05,20.0,2775.65,10.0 +2019-01-31 05:00:00,25.0,1.0,1.0,25.0,40.0,2917.6,2.0,1.0,1.0,1.0,45.05,20.0,2917.6,10.0 +2019-01-31 06:00:00,25.0,1.0,1.0,25.0,40.0,3240.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3240.25,10.0 +2019-01-31 07:00:00,25.0,1.0,1.0,25.0,40.0,3533.85,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3533.85,10.0 +2019-01-31 08:00:00,25.0,1.0,1.0,25.0,40.0,3623.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3623.45,10.0 +2019-01-31 09:00:00,25.0,1.0,1.0,25.0,40.0,3630.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3630.25,10.0 +2019-01-31 10:00:00,25.0,1.0,1.0,25.0,40.0,3674.6,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3674.6,10.0 +2019-01-31 11:00:00,25.0,1.0,1.0,25.0,40.0,3687.2,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3687.2,10.0 +2019-01-31 12:00:00,25.0,1.0,1.0,25.0,40.0,3647.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3647.25,10.0 +2019-01-31 13:00:00,25.0,1.0,1.0,25.0,40.0,3609.55,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3609.55,10.0 +2019-01-31 14:00:00,25.0,1.0,1.0,25.0,40.0,3554.75,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3554.75,10.0 +2019-01-31 15:00:00,25.0,1.0,1.0,25.0,40.0,3513.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3513.25,10.0 +2019-01-31 16:00:00,25.0,1.0,1.0,25.0,40.0,3482.8,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3482.8,10.0 +2019-01-31 17:00:00,25.0,1.0,1.0,25.0,40.0,3643.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3643.25,10.0 +2019-01-31 18:00:00,25.0,1.0,1.0,25.0,40.0,3688.5,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3688.5,10.0 +2019-01-31 19:00:00,25.0,1.0,1.0,25.0,40.0,3630.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3630.45,10.0 +2019-01-31 20:00:00,25.0,1.0,1.0,25.0,40.0,3467.25,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3467.25,10.0 +2019-01-31 21:00:00,25.0,1.0,1.0,25.0,40.0,3308.05,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3308.05,10.0 +2019-01-31 22:00:00,25.0,1.0,1.0,25.0,40.0,3203.45,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3203.45,10.0 +2019-01-31 
23:00:00,25.0,1.0,1.0,25.0,40.0,3026.95,2.0,1.0,1.0,1.0,53.50000000000001,20.0,3026.95,10.0 +2019-02-01 00:00:00,25.0,1.0,1.0,25.0,40.0,2892.2,2.0,1.0,1.0,1.0,45.05,20.0,2892.2,10.0 diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 0026b8bfd..95b6d7b75 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -4,7 +4,7 @@ tiny: start_date: 2019-01-01 00:00 - end_date: 2019-01-05 00:00 + end_date: 2019-01-02 00:00 # Changed from 2019-01-05 00:00 to 2019-01-02 00:00 time_step: 1h save_frequency_hours: null learning_mode: True @@ -13,14 +13,14 @@ tiny: continue_learning: False trained_policies_save_path: null max_bid_price: 100 - algorithm: matd3 + algorithm: ppo device: cpu + learning_rate: 0.001 matd3: actor_architecture: mlp - learning_rate: 0.001 training_episodes: 10 episodes_collecting_initial_experience: 3 - train_freq: 24h + train_freq: 48h gradient_steps: -1 batch_size: 64 gamma: 0.99 @@ -30,16 +30,17 @@ tiny: validation_episodes_interval: 5 ppo: actor_architecture: mlp - learning_rate: 0.001 training_episodes: 10 - train_freq: 24h - gradient_steps: -1 - batch_size: 64 - gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 + validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated + train_freq: 24h # how often write_to_learning_role gets called + gamma: 0.99 # Discount factor for future rewards + epochs: 4 # Number of epochs for updating the policy + clip_ratio: 0.2 # Clipping parameter for policy updates + vf_coef: 0.5 # Value function coefficient in the loss function + entropy_coef: 0.01 # Entropy coefficient for exploration + max_grad_norm: 0.5 # Gradient clipping value + gae_lambda: 0.95 # GAE lambda for advantage estimation + batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) markets_config: EOM: diff --git a/examples/inputs/example_02a/forecasts_df.csv b/examples/inputs/example_02a/forecasts_df.csv new file mode 100644 index 000000000..24ef716d7 --- /dev/null +++ b/examples/inputs/example_02a/forecasts_df.csv @@ -0,0 +1,122 @@ +,fuel_price_natural gas,fuel_price_co2,availability_pp_4,availability_pp_1,availability_pp_5,fuel_price_oil,residual_load_EOM,price_EOM,availability_pp_7,availability_pp_2,availability_pp_6,fuel_price_hard coal,fuel_price_biomass,fuel_price_uranium,demand_EOM,fuel_price_lignite,availability_pp_3 +2019-01-01 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4352.7,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4352.7,1.8,1.0 +2019-01-01 01:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4180.2,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4180.2,1.8,1.0 +2019-01-01 02:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4011.3,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4011.3,1.8,1.0 +2019-01-01 03:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3949.0,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3949.0,1.8,1.0 +2019-01-01 04:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3927.3,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3927.3,1.8,1.0 +2019-01-01 05:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3881.8,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3881.8,1.8,1.0 +2019-01-01 06:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3816.8,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3816.8,1.8,1.0 +2019-01-01 07:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3889.7,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3889.7,1.8,1.0 +2019-01-01 08:00:00,26.0,25.0,1.0,1.0,1.0,22.0,3967.4,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,3967.4,1.8,1.0 +2019-01-01 09:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4221.5,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4221.5,1.8,1.0 
+2019-01-01 10:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4491.2,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4491.2,1.8,1.0 +2019-01-01 11:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4787.2,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4787.2,1.8,1.0 +2019-01-01 12:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4916.5,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4916.5,1.8,1.0 +2019-01-01 13:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4888.5,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4888.5,1.8,1.0 +2019-01-01 14:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4865.0,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4865.0,1.8,1.0 +2019-01-01 15:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4896.9,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4896.9,1.8,1.0 +2019-01-01 16:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5081.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5081.1,1.8,1.0 +2019-01-01 17:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5396.799999999999,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5396.799999999999,1.8,1.0 +2019-01-01 18:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5468.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5468.9,1.8,1.0 +2019-01-01 19:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5377.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5377.8,1.8,1.0 +2019-01-01 20:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5154.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5154.7,1.8,1.0 +2019-01-01 21:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4988.9,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4988.9,1.8,1.0 +2019-01-01 22:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4884.1,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4884.1,1.8,1.0 +2019-01-01 23:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4611.0,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4611.0,1.8,1.0 +2019-01-02 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4406.4,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4406.4,1.8,1.0 +2019-01-02 01:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4238.200000000001,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4238.200000000001,1.8,1.0 +2019-01-02 02:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4187.0,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4187.0,1.8,1.0 +2019-01-02 03:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4237.8,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4237.8,1.8,1.0 +2019-01-02 04:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4408.8,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4408.8,1.8,1.0 +2019-01-02 05:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4709.7,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4709.7,1.8,1.0 +2019-01-02 06:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5273.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5273.2,1.8,1.0 +2019-01-02 07:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5812.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5812.7,1.8,1.0 +2019-01-02 08:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6132.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6132.2,1.8,1.0 +2019-01-02 09:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6299.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6299.5,1.8,1.0 +2019-01-02 10:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6415.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6415.1,1.8,1.0 +2019-01-02 11:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6544.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6544.9,1.8,1.0 +2019-01-02 12:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6573.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6573.9,1.8,1.0 +2019-01-02 13:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6504.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6504.3,1.8,1.0 +2019-01-02 14:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6382.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6382.2,1.8,1.0 +2019-01-02 15:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6370.4,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6370.4,1.8,1.0 +2019-01-02 16:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6489.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6489.2,1.8,1.0 +2019-01-02 17:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6808.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6808.3,1.8,1.0 +2019-01-02 
18:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6782.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6782.9,1.8,1.0 +2019-01-02 19:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6604.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6604.8,1.8,1.0 +2019-01-02 20:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6233.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6233.6,1.8,1.0 +2019-01-02 21:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5921.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5921.2,1.8,1.0 +2019-01-02 22:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5728.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5728.8,1.8,1.0 +2019-01-02 23:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5362.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5362.3,1.8,1.0 +2019-01-03 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5068.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5068.0,1.8,1.0 +2019-01-03 01:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4972.4,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4972.4,1.8,1.0 +2019-01-03 02:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4943.7,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4943.7,1.8,1.0 +2019-01-03 03:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4932.7,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4932.7,1.8,1.0 +2019-01-03 04:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5011.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5011.2,1.8,1.0 +2019-01-03 05:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5273.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5273.1,1.8,1.0 +2019-01-03 06:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5752.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5752.0,1.8,1.0 +2019-01-03 07:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6190.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6190.0,1.8,1.0 +2019-01-03 08:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6442.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6442.3,1.8,1.0 +2019-01-03 09:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6578.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6578.9,1.8,1.0 +2019-01-03 10:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6644.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6644.6,1.8,1.0 +2019-01-03 11:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6742.900000000001,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6742.900000000001,1.8,1.0 +2019-01-03 12:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6764.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6764.3,1.8,1.0 +2019-01-03 13:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6643.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6643.6,1.8,1.0 +2019-01-03 14:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6557.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6557.3,1.8,1.0 +2019-01-03 15:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6492.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6492.1,1.8,1.0 +2019-01-03 16:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6617.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6617.0,1.8,1.0 +2019-01-03 17:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6923.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6923.8,1.8,1.0 +2019-01-03 18:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6893.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6893.5,1.8,1.0 +2019-01-03 19:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6697.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6697.5,1.8,1.0 +2019-01-03 20:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6326.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6326.8,1.8,1.0 +2019-01-03 21:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6031.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6031.2,1.8,1.0 +2019-01-03 22:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5809.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5809.7,1.8,1.0 +2019-01-03 23:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5481.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5481.8,1.8,1.0 +2019-01-04 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5165.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5165.6,1.8,1.0 +2019-01-04 
01:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4975.3,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4975.3,1.8,1.0 +2019-01-04 02:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4919.3,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4919.3,1.8,1.0 +2019-01-04 03:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4965.0,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4965.0,1.8,1.0 +2019-01-04 04:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5069.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5069.7,1.8,1.0 +2019-01-04 05:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5311.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5311.5,1.8,1.0 +2019-01-04 06:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5803.299999999999,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5803.299999999999,1.8,1.0 +2019-01-04 07:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6339.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6339.1,1.8,1.0 +2019-01-04 08:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6653.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6653.5,1.8,1.0 +2019-01-04 09:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6818.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6818.5,1.8,1.0 +2019-01-04 10:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6939.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6939.7,1.8,1.0 +2019-01-04 11:00:00,26.0,25.0,1.0,1.0,1.0,22.0,7007.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,7007.6,1.8,1.0 +2019-01-04 12:00:00,26.0,25.0,1.0,1.0,1.0,22.0,7040.2,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,7040.2,1.8,1.0 +2019-01-04 13:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6935.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6935.3,1.8,1.0 +2019-01-04 14:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6755.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6755.7,1.8,1.0 +2019-01-04 15:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6618.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6618.8,1.8,1.0 +2019-01-04 16:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6752.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6752.8,1.8,1.0 +2019-01-04 17:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6955.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6955.6,1.8,1.0 +2019-01-04 18:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6888.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6888.9,1.8,1.0 +2019-01-04 19:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6687.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6687.1,1.8,1.0 +2019-01-04 20:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6292.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6292.6,1.8,1.0 +2019-01-04 21:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5997.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5997.8,1.8,1.0 +2019-01-04 22:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5814.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5814.3,1.8,1.0 +2019-01-04 23:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5421.9,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5421.9,1.8,1.0 +2019-01-05 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5116.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5116.6,1.8,1.0 +2019-01-05 01:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4899.1,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4899.1,1.8,1.0 +2019-01-05 02:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4777.1,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4777.1,1.8,1.0 +2019-01-05 03:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4728.8,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4728.8,1.8,1.0 +2019-01-05 04:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4742.1,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4742.1,1.8,1.0 +2019-01-05 05:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4749.4,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4749.4,1.8,1.0 +2019-01-05 06:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4845.3,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4845.3,1.8,1.0 +2019-01-05 07:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5087.599999999999,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5087.599999999999,1.8,1.0 +2019-01-05 
08:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5428.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5428.1,1.8,1.0 +2019-01-05 09:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5772.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5772.8,1.8,1.0 +2019-01-05 10:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5991.7,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5991.7,1.8,1.0 +2019-01-05 11:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6112.1,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6112.1,1.8,1.0 +2019-01-05 12:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6101.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6101.0,1.8,1.0 +2019-01-05 13:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5979.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5979.6,1.8,1.0 +2019-01-05 14:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5861.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5861.8,1.8,1.0 +2019-01-05 15:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5816.8,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5816.8,1.8,1.0 +2019-01-05 16:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5921.299999999999,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5921.299999999999,1.8,1.0 +2019-01-05 17:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6206.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6206.0,1.8,1.0 +2019-01-05 18:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6212.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6212.5,1.8,1.0 +2019-01-05 19:00:00,26.0,25.0,1.0,1.0,1.0,22.0,6030.3,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,6030.3,1.8,1.0 +2019-01-05 20:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5661.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5661.5,1.8,1.0 +2019-01-05 21:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5425.0,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5425.0,1.8,1.0 +2019-01-05 22:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5319.6,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5319.6,1.8,1.0 +2019-01-05 23:00:00,26.0,25.0,1.0,1.0,1.0,22.0,5019.5,55.708333333333336,1.0,1.0,1.0,8.5,21.0,0.9,5019.5,1.8,1.0 +2019-01-06 00:00:00,26.0,25.0,1.0,1.0,1.0,22.0,4692.1,36.15625,1.0,1.0,1.0,8.5,21.0,0.9,4692.1,1.8,1.0 From 128727fa827f707eb42a3ce9a8f3ee75e40a3446 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 1 Oct 2024 17:52:46 +0200 Subject: [PATCH 04/23] first RUNABLE BUT NOT VALIDATED Version of PPO - readd buffer_size to rollout buffer and include warning if we exceed it - switch default values to condig reading - started debugging policy update - adjusted dimensions of output --- .../reinforcement_learning/algorithms/ppo.py | 127 +++++----- assume/reinforcement_learning/buffer.py | 229 +++++++++++------- .../reinforcement_learning/learning_role.py | 199 +++++---------- assume/reinforcement_learning/raw_ppo.py | 74 ++++-- assume/scenario/loader_csv.py | 94 +++---- assume/strategies/learning_strategies.py | 9 +- examples/examples.py | 7 +- .../example_01a/forecasts_df.csv.license | 3 + .../example_02a/forecasts_df.csv.license | 3 + 9 files changed, 370 insertions(+), 375 deletions(-) create mode 100644 examples/inputs/example_01a/forecasts_df.csv.license create mode 100644 examples/inputs/example_02a/forecasts_df.csv.license diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 9de249993..01bfbe4b8 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -18,38 +18,36 @@ class PPO(RLAlgorithm): """ - Proximal Policy Optimization (PPO) is a robust and efficient policy gradient method for reinforcement learning. - It strikes a balance between trust-region methods and simpler approaches by using clipped objective functions. 
- PPO avoids large updates to the policy by restricting changes to stay within a specified range, which helps stabilize training. - The key improvements include the introduction of a surrogate objective that limits policy updates and ensures efficient learning, + Proximal Policy Optimization (PPO) is a robust and efficient policy gradient method for reinforcement learning. + It strikes a balance between trust-region methods and simpler approaches by using clipped objective functions. + PPO avoids large updates to the policy by restricting changes to stay within a specified range, which helps stabilize training. + The key improvements include the introduction of a surrogate objective that limits policy updates and ensures efficient learning, as well as the use of multiple epochs of stochastic gradient descent on batches of data. Open AI Spinning guide: https://spinningup.openai.com/en/latest/algorithms/ppo.html# Original paper: https://arxiv.org/pdf/1802.09477.pdf """ - + # Change order and mandatory parameters in the superclass, removed and newly added parameters def __init__( self, learning_role, - learning_rate=1e-4, - gamma=0.99, # Discount factor for future rewards - epochs=10, # Number of epochs for updating the policy - clip_ratio=0.2, # Clipping parameter for policy updates - vf_coef=0.5, # Value function coefficient in the loss function - entropy_coef=0.02, # Entropy coefficient for exploration - max_grad_norm=0.5, # Gradient clipping value - gae_lambda=0.95, # GAE lambda for advantage estimation - batch_size=5, # Batch size for each update, if mini-batch approach is used (currently not implemented) - actor_architecture="mlp", + learning_rate: float, + gamma: float, # Discount factor for future rewards + epochs: int, # Number of epochs for updating the policy + clip_ratio: float, # Clipping parameter for policy updates + vf_coef: float, # Value function coefficient in the loss function + entropy_coef: float, # Entropy coefficient for exploration + max_grad_norm: float, # Gradient clipping value + gae_lambda: float, # GAE lambda for advantage estimation + actor_architecture: str, ): super().__init__( - learning_role, - learning_rate, - batch_size, - gamma, - actor_architecture, + learning_role=learning_role, + learning_rate=learning_rate, + gamma=gamma, + actor_architecture=actor_architecture, ) self.epochs = epochs self.clip_ratio = clip_ratio @@ -57,8 +55,7 @@ def __init__( self.entropy_coef = entropy_coef self.max_grad_norm = max_grad_norm self.gae_lambda = gae_lambda - self.n_updates = 0 # Number of updates performed - + self.n_updates = 0 # Number of updates performed # Unchanged method from MATD3 def save_params(self, directory): @@ -208,7 +205,6 @@ def load_actor_params(self, directory: str) -> None: except Exception: logger.warning(f"No actor values loaded for agent {u_id}") - # Removed target_critics and actor_target in comparison to MATD3 def initialize_policy(self, actors_and_critics: dict = None) -> None: """ @@ -264,14 +260,6 @@ def create_actors(self) -> None: num_timeseries_obs_dim=unit_strategy.num_timeseries_obs_dim, ).to(self.device) - # unit_strategy.actor_target = Actor( - # obs_dim=unit_strategy.obs_dim, - # act_dim=unit_strategy.act_dim, - # float_type=self.float_type, - # ).to(self.device) - # unit_strategy.actor_target.load_state_dict(unit_strategy.actor.state_dict()) - # unit_strategy.actor_target.train(mode=False) - unit_strategy.actor.optimizer = Adam( unit_strategy.actor.parameters(), lr=self.learning_rate ) @@ -292,7 +280,7 @@ def create_actors(self) -> 
None: self.act_dim = act_dim_list[0] # Removed target_critics in comparison to MATD3 - # Changed initialization of CriticPPO compared to MATD3 + # Changed initialization of CriticPPO compared to MATD3 def create_critics(self) -> None: """ Create decentralized critic networks for reinforcement learning. @@ -303,12 +291,11 @@ def create_critics(self) -> None: Notes: Each agent has its own critic, so the critic is no longer shared among all agents. """ - n_agents = len(self.learning_role.rl_strats) + strategy: LearningStrategy unique_obs_dim_list = [] for u_id, strategy in self.learning_role.rl_strats.items(): - self.learning_role.critics[u_id] = CriticPPO( obs_dim=strategy.obs_dim, float_type=self.float_type, @@ -318,17 +305,9 @@ def create_critics(self) -> None: self.learning_role.critics[u_id].parameters(), lr=self.learning_rate ) - # self.learning_role.target_critics[u_id].load_state_dict( - # self.learning_role.critics[u_id].state_dict() - # ) - # self.learning_role.target_critics[u_id].train(mode=False) - self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( self.device ) - # self.learning_role.target_critics[u_id] = self.learning_role.target_critics[ - # u_id - # ].to(self.device) unique_obs_dim_list.append(strategy.unique_obs_dim) @@ -353,44 +332,38 @@ def extract_policy(self) -> dict: dict: The extracted actor and critic networks. """ actors = {} - actor_targets = {} for u_id, unit_strategy in self.learning_role.rl_strats.items(): actors[u_id] = unit_strategy.actor - # actor_targets[u_id] = unit_strategy.actor_target actors_and_critics = { "actors": actors, - # "actor_targets": actor_targets, "critics": self.learning_role.critics, - # "target_critics": self.learning_role.target_critics, "obs_dim": self.obs_dim, "act_dim": self.act_dim, "unique_obs_dim": self.unique_obs_dim, } return actors_and_critics - + def update_policy(self): """ Perform policy updates using PPO with the clipped objective. """ - logger.debug("Updating Policy") # We will iterate for multiple epochs to update both the policy (actor) and value (critic) networks # The number of epochs controls how many times we update using the same collected data (from the buffer). - n_rl_agents = len(self.learning_role.rl_strats.keys()) - for _ in range(self.epochs): + + for _ in range(self.epochs): self.n_updates += 1 - i = 0 # Iterate through over each agent's strategy # Each agent has its own actor and critic. Critic (value network) is in comparison to MATD3 decentralized, meaning each agent learns its own value function. for u_id in self.learning_role.rl_strats.keys(): critic = self.learning_role.critics[u_id] actor = self.learning_role.rl_strats[u_id].actor - + # Retrieve experiences from the buffer # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. transitions = self.learning_role.buffer.get() @@ -399,28 +372,22 @@ def update_policy(self): rewards = transitions.rewards log_probs = transitions.log_probs - - - # STARTING FROM HERE, THE IMPLEMENTATION NEEDS TO BE FIXED # Potentially, it could be useful to source some functionality out into methods stored in buffer.py - - # Pass the current states through the critic network to get value estimates. - values = critic(states) - + values = critic(states).squeeze(dim=2) + # Store the calculated values in the rollout buffer # These values are used later to calculate the advantage estimates (for policy updates). 
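# Illustrative sketch (not part of the patch above): the value estimates computed by the
# critic feed the Generalized Advantage Estimation (GAE) recursion that follows in
# update_policy. A minimal, self-contained version of that recursion, assuming a single
# rollout, a bootstrap value of 0 after the last collected step, and plain Python floats;
# the helper name `gae_sketch` is hypothetical, not repository API:
def gae_sketch(rewards, values, gamma=0.99, gae_lambda=0.95):
    """Return (advantages, returns) for one rollout; illustration only."""
    advantages = [0.0] * len(rewards)
    last_advantage = 0.0
    for t in reversed(range(len(rewards))):
        # No value is bootstrapped after the final collected step, mirroring the loop below.
        next_value = values[t + 1] if t < len(rewards) - 1 else 0.0
        # Temporal-difference error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        last_advantage = delta + gamma * gae_lambda * last_advantage
        advantages[t] = last_advantage
    returns = [adv + val for adv, val in zip(advantages, values)]
    return advantages, returns

# Example: gae_sketch([1.0, 0.0, 1.0], [0.5, 0.4, 0.6]) weights nearby TD errors more
# heavily in each advantage; the tensor-based loop below applies the same recursion.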
self.learning_role.buffer.values = values.detach().cpu().numpy() - print("Buffer values") - print(self.learning_role.buffer.values) - # Compute advantages using Generalized Advantage Estimation (GAE) advantages = [] last_advantage = 0 returns = [] + + # Iterate through the collected experiences in reverse order to calculate advantages and returns for t in reversed(range(len(rewards))): if t == len(rewards) - 1: next_value = 0 @@ -428,10 +395,14 @@ def update_policy(self): next_value = values[t + 1] # Temporal difference delta - delta = rewards[t] + self.gamma * next_value - values[t] # Use self.gamma for discount factor - + delta = ( + rewards[t] + self.gamma * next_value - values[t] + ) # Use self.gamma for discount factor + # GAE advantage - last_advantage = delta + self.gamma * self.gae_lambda * last_advantage # Use self.gae_lambda for advantage estimation + last_advantage = ( + delta + self.gamma * self.gae_lambda * last_advantage + ) # Use self.gae_lambda for advantage estimation advantages.insert(0, last_advantage) returns.insert(0, last_advantage + values[t]) @@ -441,7 +412,9 @@ def update_policy(self): # Evaluate the new log-probabilities and entropy under the current policy action_means = actor(states) - action_stddev = th.ones_like(action_means) # Assuming fixed standard deviation for simplicity + action_stddev = th.ones_like( + action_means + ) # Assuming fixed standard deviation for simplicity dist = th.distributions.Normal(action_means, action_stddev) new_log_probs = dist.log_prob(actions).sum(-1) entropy = dist.entropy().sum(-1) @@ -451,16 +424,23 @@ def update_policy(self): # Surrogate loss calculation surrogate1 = ratio * advantages - surrogate2 = th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * advantages # Use self.clip_ratio + surrogate2 = ( + th.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) + * advantages + ) # Use self.clip_ratio # Final policy loss (clipped surrogate loss) policy_loss = -th.min(surrogate1, surrogate2).mean() # Value loss (mean squared error between the predicted values and returns) - value_loss = F.mse_loss(returns, values) + value_loss = F.mse_loss(returns, values.squeeze()) # Total loss: policy loss + value loss - entropy bonus - total_loss = policy_loss + self.vf_coef * value_loss - self.entropy_coef * entropy.mean() # Use self.vf_coef and self.entropy_coef + total_loss = ( + policy_loss + + self.vf_coef * value_loss + - self.entropy_coef * entropy.mean() + ) # Use self.vf_coef and self.entropy_coef # Zero the gradients and perform backpropagation for both actor and critic actor.optimizer.zero_grad() @@ -468,8 +448,12 @@ def update_policy(self): total_loss.backward() # Clip gradients to prevent gradient explosion - th.nn.utils.clip_grad_norm_(actor.parameters(), self.max_grad_norm) # Use self.max_grad_norm - th.nn.utils.clip_grad_norm_(critic.parameters(), self.max_grad_norm) # Use self.max_grad_norm + th.nn.utils.clip_grad_norm_( + actor.parameters(), self.max_grad_norm + ) # Use self.max_grad_norm + th.nn.utils.clip_grad_norm_( + critic.parameters(), self.max_grad_norm + ) # Use self.max_grad_norm # Perform optimization steps actor.optimizer.step() @@ -511,4 +495,3 @@ def get_actions(rl_strategy, next_observation): sampled_action = sampled_action.clamp(-1, 1) return sampled_action, log_prob_action - diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index b099d61fe..12b2b1a57 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ 
-7,7 +7,6 @@ import numpy as np import torch as th -import datetime try: # Check memory used by replay buffer when possible @@ -176,9 +175,10 @@ def sample(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + class RolloutBufferTransitions(NamedTuple): """ - A named tuple that represents the data stored in a rollout buffer for PPO. + A named tuple that represents the data stored in a rollout buffer for PPO. Attributes: observations (torch.Tensor): The observations of the agents. @@ -187,11 +187,13 @@ class RolloutBufferTransitions(NamedTuple): advantages (torch.Tensor): The advantages calculated using GAE. returns (torch.Tensor): The returns (discounted rewards) calculated. """ + observations: th.Tensor actions: th.Tensor rewards: th.Tensor log_probs: th.Tensor + class RolloutBuffer: def __init__( self, @@ -200,33 +202,38 @@ def __init__( n_rl_units: int, device: str, float_type, - initial_size: int = 0, + buffer_size: int, ): """ A class that represents a rollout buffer for storing observations, actions, and rewards. The buffer starts empty and is dynamically expanded when needed. - + Args: obs_dim (int): The dimension of the observation space. act_dim (int): The dimension of the action space. n_rl_units (int): The number of reinforcement learning units. device (str): The device to use for storing the data (e.g., 'cpu' or 'cuda'). float_type (torch.dtype): The data type to use for the stored data. - initial_size (int): The initial size of the buffer (default is 0). + buffer_size (int): The maximal size of the buffer """ - + self.obs_dim = obs_dim self.act_dim = act_dim self.n_rl_units = n_rl_units self.device = device + self.buffer_size = buffer_size # Start with no buffer (None), will be created dynamically when first data is added - self.observations = None # Stores the agent's observations (states) at each timestep + self.observations = ( + None # Stores the agent's observations (states) at each timestep + ) self.actions = None # Stores the actions taken by the agent self.rewards = None # Stores the rewards received after each action - self.log_probs = None # Stores the log-probabilities of the actions, used to compute the ratio for policy update + self.log_probs = None # Stores the log-probabilities of the actions, used to compute the ratio for policy update - self.values = None # Stores the value estimates (critic's predictions) of each state + self.values = ( + None # Stores the value estimates (critic's predictions) of each state + ) self.advantages = None # Stores the computed advantages using GAE (Generalized Advantage Estimation), central to PPO's policy updates self.returns = None # Stores the discounted rewards (also known as returns), used to compute the value loss for training the critic @@ -245,76 +252,116 @@ def initialize_buffer(self, size): self.actions = np.zeros( (size, self.n_rl_units, self.act_dim), dtype=self.np_float_type ) - self.rewards = np.zeros( - (size, self.n_rl_units), dtype=self.np_float_type - ) - self.log_probs = np.zeros( - (size, self.n_rl_units), dtype=np.float32 + self.rewards = np.zeros((size, self.n_rl_units), dtype=self.np_float_type) + self.log_probs = np.zeros((size, self.n_rl_units), dtype=np.float32) + self.values = np.zeros((size, self.n_rl_units), dtype=np.float32) + self.advantages = np.zeros((size, self.n_rl_units), dtype=np.float32) + self.returns = np.zeros((size, self.n_rl_units), dtype=np.float32) + + def expand_buffer(self, additional_size): + """Expands the buffer by the given 
additional size and checks if there is enough memory available.""" + + # Calculation of the memory requirement for all 7 arrays + additional_memory_usage = ( + np.zeros( + (additional_size, self.n_rl_units, self.obs_dim), + dtype=self.np_float_type, + ).nbytes + + np.zeros( + (additional_size, self.n_rl_units, self.act_dim), + dtype=self.np_float_type, + ).nbytes + + np.zeros( + (additional_size, self.n_rl_units), dtype=self.np_float_type + ).nbytes # rewards + + np.zeros( + (additional_size, self.n_rl_units), dtype=np.float32 + ).nbytes # log_probs + + np.zeros( + (additional_size, self.n_rl_units), dtype=np.float32 + ).nbytes # values + + np.zeros( + (additional_size, self.n_rl_units), dtype=np.float32 + ).nbytes # advantages + + np.zeros( + (additional_size, self.n_rl_units), dtype=np.float32 + ).nbytes # returns ) - self.values = np.zeros( - (size, self.n_rl_units), dtype=np.float32 - ) - self.advantages = np.zeros( - (size, self.n_rl_units), dtype=np.float32 - ) - self.returns = np.zeros( - (size, self.n_rl_units), dtype=np.float32 - ) - -def expand_buffer(self, additional_size): - """Expands the buffer by the given additional size and checks if there is enough memory available.""" - - # Calculation of the memory requirement for all 7 arrays - additional_memory_usage = ( - np.zeros((additional_size, self.n_rl_units, self.obs_dim), dtype=self.np_float_type).nbytes + - np.zeros((additional_size, self.n_rl_units, self.act_dim), dtype=self.np_float_type).nbytes + - np.zeros((additional_size, self.n_rl_units), dtype=self.np_float_type).nbytes + # rewards - np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # log_probs - np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # values - np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes + # advantages - np.zeros((additional_size, self.n_rl_units), dtype=np.float32).nbytes # returns - ) - - # Check whether enough memory is available - if psutil is not None: - mem_available = psutil.virtual_memory().available - if additional_memory_usage > mem_available: - # Conversion to GB - additional_memory_usage_gb = additional_memory_usage / 1e9 - mem_available_gb = mem_available / 1e9 - warnings.warn( - f"Not enough memory to expand the RolloutBuffer: " - f"{additional_memory_usage_gb:.2f}GB required, but only {mem_available_gb:.2f}GB available." 
- ) - self.observations = np.concatenate( - (self.observations, np.zeros((additional_size, self.n_rl_units, self.obs_dim), dtype=self.np_float_type)), - axis=0 - ) - self.actions = np.concatenate( - (self.actions, np.zeros((additional_size, self.n_rl_units, self.act_dim), dtype=self.np_float_type)), - axis=0 - ) - self.rewards = np.concatenate( - (self.rewards, np.zeros((additional_size, self.n_rl_units), dtype=self.np_float_type)), - axis=0 - ) - self.log_probs = np.concatenate( - (self.log_probs, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), - axis=0 - ) - self.values = np.concatenate( - (self.values, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), - axis=0 - ) - self.advantages = np.concatenate( - (self.advantages, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), - axis=0 - ) - self.returns = np.concatenate( - (self.returns, np.zeros((additional_size, self.n_rl_units), dtype=np.float32)), - axis=0 - ) + # Check whether enough memory is available + if psutil is not None: + mem_available = psutil.virtual_memory().available + if additional_memory_usage > mem_available: + # Conversion to GB + additional_memory_usage_gb = additional_memory_usage / 1e9 + mem_available_gb = mem_available / 1e9 + raise MemoryError( + f"{additional_memory_usage_gb:.2f}GB required, but only {mem_available_gb:.2f}GB available." + ) + + if self.pos + additional_size > self.buffer_size: + warnings.warn( + f"Expanding the buffer will exceed the maximum buffer size of {self.buffer_size}. " + f"Current position: {self.pos}, additional size: {additional_size}." + ) + + self.observations = np.concatenate( + ( + self.observations, + np.zeros( + (additional_size, self.n_rl_units, self.obs_dim), + dtype=self.np_float_type, + ), + ), + axis=0, + ) + self.actions = np.concatenate( + ( + self.actions, + np.zeros( + (additional_size, self.n_rl_units, self.act_dim), + dtype=self.np_float_type, + ), + ), + axis=0, + ) + self.rewards = np.concatenate( + ( + self.rewards, + np.zeros( + (additional_size, self.n_rl_units), dtype=self.np_float_type + ), + ), + axis=0, + ) + self.log_probs = np.concatenate( + ( + self.log_probs, + np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + ), + axis=0, + ) + self.values = np.concatenate( + ( + self.values, + np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + ), + axis=0, + ) + self.advantages = np.concatenate( + ( + self.advantages, + np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + ), + axis=0, + ) + self.returns = np.concatenate( + ( + self.returns, + np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + ), + axis=0, + ) def add( self, @@ -326,7 +373,7 @@ def add( """ Adds an observation, action, reward, and log probabilities of all agents to the rollout buffer. If the buffer does not exist, it will be initialized. If the buffer is full, it will be expanded. - + Args: obs (numpy.ndarray): The observation to add. actions (numpy.ndarray): The actions to add. @@ -358,14 +405,18 @@ def add( # print(self.log_probs) def reset(self): - """Resets the buffer, clearing all stored data.""" + """ + Resets the buffer, clearing all stored data. + Might be needed if policy is changed within one episode, then it needs to be killed and initalized again. 
+ + """ self.observations = None self.actions = None self.rewards = None self.log_probs = None self.pos = 0 self.full = False - + # def compute_returns_and_advantages(self, last_values, dones): # """ # Compute the returns and advantages using Generalized Advantage Estimation (GAE). @@ -376,12 +427,12 @@ def reset(self): # """ # # Initialize the last advantage to 0. This will accumulate as we move backwards in time. # last_advantage = 0 - + # # Loop backward through all the steps in the buffer to calculate returns and advantages. # # This is because GAE (Generalized Advantage Estimation) relies on future rewards, # # so we compute it from the last step back to the first step. # for step in reversed(range(self.pos)): - + # # If we are at the last step in the buffer # if step == self.pos - 1: # # If it's the last step, check whether the episode has finished using `dones`. @@ -436,17 +487,11 @@ def get(self) -> RolloutBufferTransitions: Returns the observations, actions, log_probs, advantages, returns, and masks. """ data = ( - self.observations[:self.pos], - self.actions[:self.pos], - self.rewards[:self.pos], - self.log_probs[:self.pos] + self.observations[: self.pos], + self.actions[: self.pos], + self.rewards[: self.pos], + self.log_probs[: self.pos], # self.masks[:self.pos], ) return RolloutBufferTransitions(*tuple(map(self.to_torch, data))) - - def reset(self): - """Reset the buffer after each update.""" - self.pos = 0 - - diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 105d4cfd3..2ad8f6a2f 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -10,11 +10,9 @@ from mango import Role from assume.common.base import LearningConfig, LearningStrategy -from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm from assume.reinforcement_learning.algorithms.matd3 import TD3 from assume.reinforcement_learning.algorithms.ppo import PPO -from assume.reinforcement_learning.buffer import ReplayBuffer -from assume.reinforcement_learning.buffer import RolloutBuffer +from assume.reinforcement_learning.buffer import ReplayBuffer, RolloutBuffer logger = logging.getLogger(__name__) @@ -37,11 +35,15 @@ def __init__( ): # General parameters self.rl_algorithm_name = learning_config.get("algorithm", "matd3") - self.early_stopping_steps = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_steps", 10) - self.early_stopping_threshold = learning_config.get(self.rl_algorithm_name, {}).get("early_stopping_threshold", 0.05) + self.early_stopping_steps = learning_config.get(self.rl_algorithm_name, {}).get( + "early_stopping_steps", 10 + ) + self.early_stopping_threshold = learning_config.get( + self.rl_algorithm_name, {} + ).get("early_stopping_threshold", 0.05) self.episodes_done = 0 self.rl_strats: dict[int, LearningStrategy] = {} - + self.critics = {} # define whether we train model or evaluate it @@ -53,44 +55,57 @@ def __init__( "trained_policies_load_path", self.trained_policies_save_path ) - self.device = th.device(learning_config["device"] if th.cuda.is_available() else "cpu") + self.device = th.device( + learning_config["device"] if th.cuda.is_available() else "cpu" + ) self.learning_rate = learning_config["learning_rate"] + self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get( + "actor_architecture", "mlp" + ) + self.training_episodes = learning_config[self.rl_algorithm_name][ + "training_episodes" + ] + self.train_freq = 
learning_config[self.rl_algorithm_name]["train_freq"] + self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get( + "batch_size", 128 + ) + self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) + self.episodes_collecting_initial_experience = max( + learning_config.get(self.rl_algorithm_name, {}).get( + "episodes_collecting_initial_experience", 5 + ), + 1, + ) # Algorithm-specific parameters if self.rl_algorithm_name == "matd3": self.buffer: ReplayBuffer = None self.target_critics = {} - self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get("actor_architecture", "mlp") - self.training_episodes = learning_config["matd3"]["training_episodes"] - self.train_freq = learning_config["matd3"]["train_freq"] - self.gradient_steps = int(self.train_freq[:-1]) if learning_config["matd3"].get("gradient_steps", -1) == -1 else learning_config["matd3"]["gradient_steps"] - self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get("batch_size", 128) - self.gamma = learning_config.get(self.rl_algorithm_name, {}).get("gamma", 0.99) - + self.gradient_steps = ( + int(self.train_freq[:-1]) + if learning_config["matd3"].get("gradient_steps", -1) == -1 + else learning_config["matd3"]["gradient_steps"] + ) self.noise_sigma = learning_config["matd3"]["noise_sigma"] self.noise_scale = learning_config["matd3"]["noise_scale"] - self.episodes_collecting_initial_experience = max(learning_config.get(self.rl_algorithm_name, {}).get("episodes_collecting_initial_experience", 5), 1) - + elif self.rl_algorithm_name == "ppo": self.buffer: RolloutBuffer = None - self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get("actor_architecture", "mlp") - self.training_episodes = learning_config["ppo"]["training_episodes"] - self.train_freq = learning_config["ppo"]["train_freq"] - # self.steps_per_epoch = learning_config["ppo"]["steps_per_epoch"] - # self.batch_size = learning_config["matd3"]["batch_size"] - # self.gamma = learning_config["matd3"]["gamma"] - # self.clip_ratio = learning_config["ppo"]["clip_ratio"] - # self.entropy_coeff = learning_config["ppo"]["entropy_coeff"] - # self.value_coeff = learning_config["ppo"]["value_coeff"] - # self.device = th.device(learning_config["ppo"]["device"] if th.cuda.is_available() else "cpu") - # self.learning_rate = learning_config["ppo"]["learning_rate"] + + # Potentially more parameters for PPO + self.steps_per_epoch = learning_config["ppo"].get("steps_per_epoch", 10) + self.clip_ratio = learning_config["ppo"].get("clip_ratio", 0.2) + self.entropy_coeff = learning_config["ppo"].get("entropy_coeff", 0.02) + self.value_coeff = learning_config["ppo"].get("value_coeff", 0.5) + self.max_grad_norm = learning_config["ppo"].get("max_grad_norm", 0.5) + self.gae_lambda = learning_config["ppo"].get("gae_lambda", 0.95) # Set up CUDA and float types th.backends.cuda.matmul.allow_tf32 = True th.backends.cudnn.allow_tf32 = True - + # future: add option to choose between float16 and float32 # float_type = learning_config.get("float_type", "float32") self.float_type = th.float @@ -104,7 +119,6 @@ def __init__( self.rl_eval = defaultdict(list) # List of avg changes self.avg_rewards = [] - # TD3 and PPO def load_inter_episodic_data(self, inter_episodic_data): @@ -115,6 +129,7 @@ def load_inter_episodic_data(self, inter_episodic_data): inter_episodic_data (dict): The inter-episodic data to be loaded. 
""" + # TODO: Make this function of algorithm so that we loose case sensitivity here self.episodes_done = inter_episodic_data["episodes_done"] self.eval_episodes_done = inter_episodic_data["eval_episodes_done"] self.max_eval = inter_episodic_data["max_eval"] @@ -238,7 +253,7 @@ def get_noise_scale(self) -> None: stored_scale = list(self.rl_strats.values())[0].action_noise.scale return stored_scale - + def create_learning_algorithm(self, algorithm: str): """ Algorithm initialization depending on the type @@ -255,27 +270,24 @@ def create_learning_algorithm(self, algorithm: str): ) elif algorithm == "ppo": self.rl_algorithm = PPO( - learning_role=self, - learning_rate=self.learning_rate, - gamma=self.gamma, # Discount factor - epochs=self.epochs, # Number of epochs for policy updates - clip_ratio=self.clip_ratio, # PPO-specific clipping parameter - vf_coef=self.vf_coef, # Coefficient for value function loss - entropy_coef=self.entropy_coef, # Coefficient for entropy to encourage exploration - max_grad_norm=self.max_grad_norm, # Maximum gradient norm for clipping - gae_lambda=self.gae_lambda, # Lambda for Generalized Advantage Estimation (GAE) - batch_size=self.batch_size, # Batch size for mini-batch updates (optional) - actor_architecture=self.actor_architecture, # Actor network architecture - ) + learning_role=self, + learning_rate=self.learning_rate, + gamma=self.gamma, # Discount factor + epochs=self.steps_per_epoch, # Number of epochs for policy updates + clip_ratio=self.clip_ratio, # PPO-specific clipping parameter + vf_coef=self.value_coeff, # Coefficient for value function loss + entropy_coef=self.entropy_coeff, # Coefficient for entropy to encourage exploration + max_grad_norm=self.max_grad_norm, # Maximum gradient norm for clipping + gae_lambda=self.gae_lambda, # Lambda for Generalized Advantage Estimation (GAE) + actor_architecture=self.actor_architecture, # Actor network architecture + ) else: logger.error(f"Learning algorithm {algorithm} not implemented!") # Loop over rl_strats - # self.rl_algorithm an die Learning Strategy übergeben + # self.rl_algorithm an die Learning Strategy übergeben # Damit die Learning Strategy auf act/get_actions zugreifen kann - - # TD3 def initialize_policy(self, actors_and_critics: dict = None) -> None: """ @@ -311,12 +323,11 @@ def update_policy(self) -> None: This method is typically scheduled to run periodically during training to continuously improve the agent's policy. """ if self.rl_algorithm_name == "ppo": - self.rl_algorithm.update_policy() + self.rl_algorithm.update_policy() else: if self.episodes_done > self.episodes_collecting_initial_experience: self.rl_algorithm.update_policy() - def compare_and_save_policies(self, metrics: dict) -> bool: """ Compare evaluation metrics and save policies based on the best achieved performance according to the metrics calculated. 
@@ -385,97 +396,3 @@ def compare_and_save_policies(self, metrics: dict) -> bool: return True return False - - # def __init__( - # self, - # learning_config: LearningConfig, - # ): - # # how many learning roles do exist and how are they named - # self.buffer: ReplayBuffer = None - # self.early_stopping_steps = learning_config.get("early_stopping_steps", 10) - # self.early_stopping_threshold = learning_config.get( - # "early_stopping_threshold", 0.05 - # ) - # self.episodes_done = 0 - # self.rl_strats: dict[int, LearningStrategy] = {} - # self.rl_algorithm = learning_config["algorithm"] - # self.actor_architecture = learning_config["actor_architecture"] - # self.critics = {} - # self.target_critics = {} - - # # define whether we train model or evaluate it - # self.training_episodes = learning_config["training_episodes"] - # self.learning_mode = learning_config["learning_mode"] - # self.continue_learning = learning_config["continue_learning"] - # self.perform_evaluation = learning_config["perform_evaluation"] - # self.trained_policies_save_path = learning_config["trained_policies_save_path"] - # self.trained_policies_load_path = learning_config.get( - # "trained_policies_load_path", self.trained_policies_save_path - # ) - - # cuda_device = ( - # learning_config["device"] - # if "cuda" in learning_config.get("device", "cpu") - # else "cpu" - # ) - # self.device = th.device(cuda_device if th.cuda.is_available() else "cpu") - - # # future: add option to choose between float16 and float32 - # # float_type = learning_config.get("float_type", "float32") - # self.float_type = th.float - - # th.backends.cuda.matmul.allow_tf32 = True - # th.backends.cudnn.allow_tf32 = True - - # self.learning_rate = learning_config.get("learning_rate", 1e-4) - - # # if we do not have initital experience collected we will get an error as no samples are avaiable on the - # # buffer from which we can draw exprience to adapt the strategy, hence we set it to minium one episode - - # self.episodes_collecting_initial_experience = max( - # learning_config.get("episodes_collecting_initial_experience", 5), 1 - # ) - - # self.train_freq = learning_config.get("train_freq", "1h") - # self.gradient_steps = ( - # int(self.train_freq[:-1]) - # if learning_config.get("gradient_steps", -1) == -1 - # else learning_config["gradient_steps"] - # ) - # self.batch_size = learning_config.get("batch_size", 128) - # self.gamma = learning_config.get("gamma", 0.99) - - # self.eval_episodes_done = 0 - - # # function that initializes learning, needs to be an extra function so that it can be called after buffer is given to Role - # self.create_learning_algorithm(self.rl_algorithm) - - # # store evaluation values - # self.max_eval = defaultdict(lambda: -1e9) - # self.rl_eval = defaultdict(list) - # # list of avg_changes - # self.avg_rewards = [] - - # MATD3 version - # def create_learning_algorithm(self, algorithm: RLAlgorithm): - # """ - # Create and initialize the reinforcement learning algorithm. - - # This method creates and initializes the reinforcement learning algorithm based on the specified algorithm name. The algorithm - # is associated with the learning role and configured with relevant hyperparameters. - - # Args: - # algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. 
- # """ - # if algorithm == "matd3": - # self.rl_algorithm = TD3( - # learning_role=self, - # learning_rate=self.learning_rate, - # episodes_collecting_initial_experience=self.episodes_collecting_initial_experience, - # gradient_steps=self.gradient_steps, - # batch_size=self.batch_size, - # gamma=self.gamma, - # actor_architecture=self.actor_architecture, - # ) - # else: - # logger.error(f"Learning algorithm {algorithm} not implemented!") \ No newline at end of file diff --git a/assume/reinforcement_learning/raw_ppo.py b/assume/reinforcement_learning/raw_ppo.py index 5af03b5dd..0a59a1e59 100644 --- a/assume/reinforcement_learning/raw_ppo.py +++ b/assume/reinforcement_learning/raw_ppo.py @@ -1,21 +1,25 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +from collections import deque + import gym +import numpy as np import torch import torch.nn as nn import torch.optim as optim -from collections import deque -import numpy as np + class MLPActorCritic(nn.Module): """ Simple MLP Actor-Critic network with separate actor and critic heads. """ + def __init__(self, obs_dim, act_dim): - super(MLPActorCritic, self).__init__() + super().__init__() self.shared = nn.Sequential( - nn.Linear(obs_dim, 64), - nn.ReLU(), - nn.Linear(64, 64), - nn.ReLU() + nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, 64), nn.ReLU() ) # Actor head self.actor = nn.Linear(64, act_dim) @@ -44,7 +48,19 @@ class PPO: """ Proximal Policy Optimization (PPO) implementation in PyTorch. """ - def __init__(self, env, actor_critic, clip_param=0.2, entcoeff=0.01, optim_stepsize=1e-3, optim_epochs=4, gamma=0.99, lam=0.95, batch_size=64): + + def __init__( + self, + env, + actor_critic, + clip_param=0.2, + entcoeff=0.01, + optim_stepsize=1e-3, + optim_epochs=4, + gamma=0.99, + lam=0.95, + batch_size=64, + ): self.env = env self.actor_critic = actor_critic self.clip_param = clip_param @@ -54,7 +70,9 @@ def __init__(self, env, actor_critic, clip_param=0.2, entcoeff=0.01, optim_steps self.gamma = gamma self.lam = lam self.batch_size = batch_size - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=self.optim_stepsize) + self.optimizer = optim.Adam( + self.actor_critic.parameters(), lr=self.optim_stepsize + ) def discount_rewards(self, rewards, dones, gamma): """ @@ -87,7 +105,14 @@ def rollout(self, timesteps_per_actorbatch): """ # Reset env obs = self.env.reset() - obs_list, actions_list, rewards_list, dones_list, log_probs_list, values_list = [], [], [], [], [], [] + ( + obs_list, + actions_list, + rewards_list, + dones_list, + log_probs_list, + values_list, + ) = [], [], [], [], [], [] for _ in range(timesteps_per_actorbatch): obs_tensor = torch.FloatTensor(obs).unsqueeze(0) action, log_prob, value = self.actor_critic.act(obs_tensor) @@ -126,7 +151,9 @@ def ppo_update(self, batch, clip_param, entcoeff): observations, actions, old_log_probs, returns, advantages = batch for _ in range(self.optim_epochs): - new_log_probs, values, entropy = self.actor_critic.evaluate_actions(observations, actions) + new_log_probs, values, entropy = self.actor_critic.evaluate_actions( + observations, actions + ) ratio = torch.exp(new_log_probs - old_log_probs) surr1 = ratio * advantages @@ -160,8 +187,12 @@ def train(self, total_timesteps, timesteps_per_actorbatch, log_interval=100): values = batch["values"].detach() # Compute discounted rewards and advantages - returns = torch.FloatTensor(self.discount_rewards(rewards, dones, self.gamma)) - advantages = torch.FloatTensor(self.compute_gae(rewards, 
values.numpy(), dones, self.gamma, self.lam)) + returns = torch.FloatTensor( + self.discount_rewards(rewards, dones, self.gamma) + ) + advantages = torch.FloatTensor( + self.compute_gae(rewards, values.numpy(), dones, self.gamma, self.lam) + ) # Normalize advantages advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) @@ -175,15 +206,26 @@ def train(self, total_timesteps, timesteps_per_actorbatch, log_interval=100): reward_history.append(avg_reward) if total_timesteps_done % log_interval == 0: - print(f"Timesteps: {total_timesteps_done}, Avg Reward: {np.mean(reward_history)}") + print( + f"Timesteps: {total_timesteps_done}, Avg Reward: {np.mean(reward_history)}" + ) # Example usage with CartPole environment -env = gym.make('CartPole-v1') +env = gym.make("CartPole-v1") obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n actor_critic = MLPActorCritic(obs_dim, act_dim) -ppo = PPO(env, actor_critic, clip_param=0.2, entcoeff=0.01, optim_stepsize=1e-3, optim_epochs=4, gamma=0.99, lam=0.95) +ppo = PPO( + env, + actor_critic, + clip_param=0.2, + entcoeff=0.01, + optim_stepsize=1e-3, + optim_epochs=4, + gamma=0.99, + lam=0.95, +) ppo.train(total_timesteps=10000, timesteps_per_actorbatch=256) diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 9bf60b97c..50508caf1 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -26,7 +26,7 @@ from assume.strategies import BaseStrategy from assume.world import World -#from assume.reinforcement_learning.learning_utils import calculate_total_timesteps_per_episode +# from assume.reinforcement_learning.learning_utils import calculate_total_timesteps_per_episode logger = logging.getLogger(__name__) @@ -894,13 +894,17 @@ def run_learning( world.export_csv_path = "" actors_and_critics = None - world.learning_role.initialize_policy(actors_and_critics=actors_and_critics) # Leads to the initialization of the Learning role, makes world.learning_role.rl_algorithm_name accessible + world.learning_role.initialize_policy( + actors_and_critics=actors_and_critics + ) # Leads to the initialization of the Learning role, makes world.learning_role.rl_algorithm_name accessible world.output_role.del_similar_runs() save_path = world.learning_config["trained_policies_save_path"] if Path(save_path).is_dir(): - accept = input(f"{save_path=} exists - should we overwrite current learnings? (y/N) ") + accept = input( + f"{save_path=} exists - should we overwrite current learnings? 
(y/N) " + ) if not accept.lower().startswith("y"): raise AssumeException("Don't overwrite existing strategies") @@ -908,30 +912,23 @@ def run_learning( scenario_data = load_config_and_create_forecaster(inputs_path, scenario, study_case) # For PPO buffer size calculation - validation_interval_from_config = world.learning_config.get("validation_episodes_interval", 5) + validation_interval_from_config = world.learning_config.get( + "validation_episodes_interval", 5 + ) - if world.learning_role.rl_algorithm_name == "matd3": - buffer = ReplayBuffer( - buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), - obs_dim=world.learning_role.rl_algorithm.obs_dim, - act_dim=world.learning_role.rl_algorithm.act_dim, - n_rl_units=len(world.learning_role.rl_strats), - device=world.learning_role.device, - float_type=world.learning_role.float_type, - ) - elif world.learning_role.rl_algorithm_name == "ppo": - - # For non-dynamic buffer size: Calculate number of timesteps here for a full episode - # total_timesteps_per_episode = calculate_total_timesteps_per_episode(scenario_data['start'], scenario_data['end'], scenario_data['time_step']) - - buffer = RolloutBuffer( - # buffer_size=int(total_timesteps_per_episode * validation_interval_from_config), # For non-dynamic buffer size - obs_dim=world.learning_role.rl_algorithm.obs_dim, - act_dim=world.learning_role.rl_algorithm.act_dim, - n_rl_units=len(world.learning_role.rl_strats), - device=world.learning_role.device, - float_type=world.learning_role.float_type, - ) + buffer_cls = ( + ReplayBuffer + if world.learning_role.rl_algorithm_name == "matd3" + else RolloutBuffer + ) + buffer = buffer_cls( + buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), + obs_dim=world.learning_role.rl_algorithm.obs_dim, + act_dim=world.learning_role.rl_algorithm.act_dim, + n_rl_units=len(world.learning_role.rl_strats), + device=world.learning_role.device, + float_type=world.learning_role.float_type, + ) inter_episodic_data = { "buffer": buffer, @@ -950,7 +947,10 @@ def run_learning( # Noise is added to the actions to encourage exploration. In contrast, PPO uses stochastic policies # that naturally explore the environment by sampling actions from a probability distribution, making # external noise addition unnecessary. 
- inter_episodic_data["noise_scale"] = world.learning_config.get("noise_scale", 1.0) + # TODO: delete noise scale here as new sheduler makes it obsolete in future, leave like this now + inter_episodic_data["noise_scale"] = world.learning_config.get( + "noise_scale", 1.0 + ) # if world.learning_role.rl_algorithm_name == "matd3": # Sets the validation interval: After how many episodes does validation take place @@ -975,22 +975,19 @@ def run_learning( ) world.learning_role.load_inter_episodic_data(inter_episodic_data) - world.run() # triggers calculate_bids() which equals to step + world.run() # triggers calculate_bids() which equals to step inter_episodic_data = world.learning_role.get_inter_episodic_data() inter_episodic_data["episodes_done"] = episode - # Perform validation at regular intervals - if ( - episode % validation_interval == 0 - and ( - episode >= world.learning_role.episodes_collecting_initial_experience + validation_interval - if world.learning_role.rl_algorithm_name == "matd3" - else episode > validation_interval # For PPO - ) + if episode % validation_interval == 0 and ( + episode + >= world.learning_role.episodes_collecting_initial_experience + + validation_interval + if world.learning_role.rl_algorithm_name == "matd3" + else episode > validation_interval # For PPO ): - world.reset() setup_world( @@ -1007,9 +1004,10 @@ def run_learning( if world.learning_role.rl_algorithm_name == "matd3": total_rewards = world.output_role.get_sum_reward() avg_reward = np.mean(total_rewards) - terminate = world.learning_role.compare_and_save_policies({"avg_reward": avg_reward}) + terminate = world.learning_role.compare_and_save_policies( + {"avg_reward": avg_reward} + ) - if world.learning_role.rl_algorithm_name == "ppo": # PPO uses the surrogate loss to monitor policy updates. # The surrogate loss quantifies how much the new policy has changed compared to the old one. @@ -1017,18 +1015,25 @@ def run_learning( # - A very small value may mean that the policy is near its optimum. # - A large value could indicate excessive policy updates, leading to instability. # - # It may be useful to terminate the training early based on the surrogate loss, + # It may be useful to terminate the training early based on the surrogate loss, # especially if no significant improvement is expected, or if the model becomes unstable. # - # In this example, the surrogate_loss could be computed, and then + # In this example, the surrogate_loss could be computed, and then # `compare_and_save_policies` can be used to check whether the training should be terminated. 
- + # surrogate_loss = # terminate = world.learning_role.compare_and_save_policies({"surrogate_loss": surrogate_loss}) # Reset the PPO Rollout Buffer after each update + # TODO: add surrogate loss as a parameter to compare_and_save_policies inter_episodic_data["buffer"].reset() + total_rewards = world.output_role.get_sum_reward() + avg_reward = np.mean(total_rewards) + terminate = world.learning_role.compare_and_save_policies( + {"avg_reward": avg_reward} + ) + inter_episodic_data["eval_episodes_done"] = eval_episode if terminate: @@ -1036,10 +1041,6 @@ def run_learning( eval_episode += 1 - - - - world.reset() if episode == (world.learning_role.training_episodes): @@ -1065,5 +1066,6 @@ def run_learning( print("Evaluation finished") + if __name__ == "__main__": data = read_grid(Path("examples/inputs/example_01d")) diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index dd508df10..b5837d9e7 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -131,8 +131,6 @@ def calculate_bids( """ - print("calculate_bids in learning_strategies.py (STEP)") - bid_quantity_inflex, bid_price_inflex = 0, 0 bid_quantity_flex, bid_price_flex = 0, 0 @@ -156,9 +154,8 @@ def calculate_bids( # ============================================================================= # 2. Get the Actions, based on the observations # ============================================================================= - # actions, noise = self.get_actions(next_observation) # Old implementation with get_actions inside this class - # actions, noise = self.get_actions(self, next_observation) - # Depending on the algorithm, extra_info is either noise (MATD3) or log_probs (PPO) + # Depending on the algorithm, we call specific function that passes obs through actor and generates actions + # extra_info is either noise (MATD3) or log_probs (PPO) actions, extra_info = self.get_actions(self, next_observation) # ============================================================================= @@ -205,7 +202,7 @@ def calculate_bids( # store results in unit outputs as series to be written to the database by the unit operator unit.outputs["actions"][start] = actions # unit.outputs["exploration_noise"][start] = noise - + # TODO: Make this algo specific function # Check if extra_info is noise or log_probs and store it accordingly if isinstance(extra_info, th.Tensor) and extra_info.shape == actions.shape: unit.outputs["exploration_noise"][start] = extra_info # It's noise diff --git a/examples/examples.py b/examples/examples.py index 79c291cfa..46f9d20da 100644 --- a/examples/examples.py +++ b/examples/examples.py @@ -11,7 +11,7 @@ log = logging.getLogger(__name__) -csv_path = "C:/Users/manuk/OneDrive - bwedu/01_Studium/Master/6. 
Semester/Spezialveranstaltung/CSVs" +csv_path = "" os.makedirs("./examples/local_db", exist_ok=True) @@ -65,7 +65,10 @@ # example_01g is used in the tutorial notebook #6: Advanced order types example # # DRL references case for learning advancement testing - "small_learning_1": {"scenario": "example_02a", "study_case": "tiny"}, # Changed from base to tiny for testing + "small_learning_1": { + "scenario": "example_02a", + "study_case": "tiny", + }, # Changed from base to tiny for testing "small_learning_2": {"scenario": "example_02b", "study_case": "base"}, "small_learning_3": {"scenario": "example_02c", "study_case": "base"}, # DRL cases with lstm instead of mlp as actor neural network architecture diff --git a/examples/inputs/example_01a/forecasts_df.csv.license b/examples/inputs/example_01a/forecasts_df.csv.license new file mode 100644 index 000000000..a6ae06366 --- /dev/null +++ b/examples/inputs/example_01a/forecasts_df.csv.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: ASSUME Developers + +SPDX-License-Identifier: AGPL-3.0-or-later diff --git a/examples/inputs/example_02a/forecasts_df.csv.license b/examples/inputs/example_02a/forecasts_df.csv.license new file mode 100644 index 000000000..a6ae06366 --- /dev/null +++ b/examples/inputs/example_02a/forecasts_df.csv.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: ASSUME Developers + +SPDX-License-Identifier: AGPL-3.0-or-later From 991c3114a18e39c891f17d328c811db8c8609c2d Mon Sep 17 00:00:00 2001 From: ufqjh Date: Sat, 19 Oct 2024 14:16:02 +0200 Subject: [PATCH 05/23] - Removed unused parts of the RolloutBuffer - Established fully decentralized Critic for PPO - Added debug statements - Changed activation layer of Actor Neural Network from softsign to sigmoid --- .../algorithms/matd3.py | 2 +- .../reinforcement_learning/algorithms/ppo.py | 269 +++++++++++++++--- assume/reinforcement_learning/buffer.py | 93 +++--- .../reinforcement_learning/learning_role.py | 8 +- .../learning_unit_operator.py | 16 +- .../neural_network_architecture.py | 7 +- assume/scenario/loader_csv.py | 15 +- assume/strategies/learning_strategies.py | 7 +- examples/examples.py | 4 +- examples/inputs/example_02a/config.yaml | 50 ++-- 10 files changed, 331 insertions(+), 140 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index 2a5b3293a..a5e9b9f27 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -292,7 +292,7 @@ def create_critics(self) -> None: This method initializes critic networks for each agent in the reinforcement learning setup. Notes: - The observation dimension need to be the same, due to the centralized criic that all actors share. + The observation dimension need to be the same, due to the centralized critic that all actors share. If you have units with different observation dimensions. They need to have different critics and hence learning roles. 
""" n_agents = len(self.learning_role.rl_strats) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 01bfbe4b8..080d93a6b 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -69,13 +69,15 @@ def save_params(self, directory): self.save_critic_params(directory=f"{directory}/critics") self.save_actor_params(directory=f"{directory}/actors") + # Removed critic_target in comparison to MATD3 + # Decentralized def save_critic_params(self, directory): """ Save the parameters of critic networks. - This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict. It organizes the saved parameters into a directory structure specific to the critic - associated with each learning strategy. + This method saves the parameters of the critic networks, including the critic's state_dict and the critic's optimizer state_dict. + It organizes the saved parameters into a directory structure specific to the critic associated with each learning strategy. Args: directory (str): The base directory for saving the parameters. @@ -83,15 +85,36 @@ def save_critic_params(self, directory): os.makedirs(directory, exist_ok=True) for u_id in self.learning_role.rl_strats.keys(): obj = { - "critic": self.learning_role.critics[u_id].state_dict(), - # "critic_target": self.learning_role.target_critics[u_id].state_dict(), - "critic_optimizer": self.learning_role.critics[ - u_id - ].optimizer.state_dict(), + "critic": self.learning_role.rl_strats[u_id].critic.state_dict(), + "critic_optimizer": self.learning_role.rl_strats[u_id].critic.optimizer.state_dict(), } path = f"{directory}/critic_{u_id}.pt" th.save(obj, path) + + # Centralized + # def save_critic_params(self, directory): + # """ + # Save the parameters of critic networks. + + # This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict. It organizes the saved parameters into a directory structure specific to the critic + # associated with each learning strategy. + + # Args: + # directory (str): The base directory for saving the parameters. + # """ + # os.makedirs(directory, exist_ok=True) + # for u_id in self.learning_role.rl_strats.keys(): + # obj = { + # "critic": self.learning_role.critics[u_id].state_dict(), + # # "critic_target": self.learning_role.target_critics[u_id].state_dict(), + # "critic_optimizer": self.learning_role.critics[ + # u_id + # ].optimizer.state_dict(), + # } + # path = f"{directory}/critic_{u_id}.pt" + # th.save(obj, path) + # Removed actor_target in comparison to MATD3 (Actor network = policy network) def save_actor_params(self, directory): """ @@ -133,13 +156,14 @@ def load_params(self, directory: str) -> None: self.load_actor_params(directory) # Removed critic_target in comparison to MATD3 (critic network = value function network) + # Decentralized def load_critic_params(self, directory: str) -> None: """ Load the parameters of critic networks from a specified directory. - This method loads the parameters of critic networks, including the critic's state_dict, critic_target's state_dict, and + This method loads the parameters of critic networks, including the critic's state_dict and the critic's optimizer state_dict, from the specified directory. 
It iterates through the learning strategies associated - with the learning role, loads the respective parameters, and updates the critic and target critic networks accordingly. + with the learning role, loads the respective parameters, and updates the critic networks accordingly. Args: directory (str): The directory from which the parameters should be loaded. @@ -157,18 +181,53 @@ def load_critic_params(self, directory: str) -> None: critic_params = self.load_obj( directory=f"{directory}/critics/critic_{str(u_id)}.pt" ) - self.learning_role.critics[u_id].load_state_dict( + self.learning_role.rl_strats[u_id].critic.load_state_dict( critic_params["critic"] ) - # self.learning_role.target_critics[u_id].load_state_dict( - # critic_params["critic_target"] - # ) - self.learning_role.critics[u_id].optimizer.load_state_dict( + self.learning_role.rl_strats[u_id].critic.optimizer.load_state_dict( critic_params["critic_optimizer"] ) except Exception: logger.warning(f"No critic values loaded for agent {u_id}") + + # Centralized + # def load_critic_params(self, directory: str) -> None: + # """ + # Load the parameters of critic networks from a specified directory. + + # This method loads the parameters of critic networks, including the critic's state_dict, critic_target's state_dict, and + # the critic's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated + # with the learning role, loads the respective parameters, and updates the critic and target critic networks accordingly. + + # Args: + # directory (str): The directory from which the parameters should be loaded. + # """ + # logger.info("Loading critic parameters...") + + # if not os.path.exists(directory): + # logger.warning( + # "Specified directory for loading the critics does not exist! Starting with randomly initialized values!" + # ) + # return + + # for u_id in self.learning_role.rl_strats.keys(): + # try: + # critic_params = self.load_obj( + # directory=f"{directory}/critics/critic_{str(u_id)}.pt" + # ) + # self.learning_role.critics[u_id].load_state_dict( + # critic_params["critic"] + # ) + # # self.learning_role.target_critics[u_id].load_state_dict( + # # critic_params["critic_target"] + # # ) + # self.learning_role.critics[u_id].optimizer.load_state_dict( + # critic_params["critic_optimizer"] + # ) + # except Exception: + # logger.warning(f"No critic values loaded for agent {u_id}") + # Removed actor_target in comparison to MATD3 def load_actor_params(self, directory: str) -> None: """ @@ -206,6 +265,7 @@ def load_actor_params(self, directory: str) -> None: logger.warning(f"No actor values loaded for agent {u_id}") # Removed target_critics and actor_target in comparison to MATD3 + # Decentralized def initialize_policy(self, actors_and_critics: dict = None) -> None: """ Create actor and critic networks for reinforcement learning. @@ -215,23 +275,50 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: Args: actors_and_critics (dict): The actor and critic networks to be assigned. 
- """ if actors_and_critics is None: self.create_actors() self.create_critics() - else: - self.learning_role.critics = actors_and_critics["critics"] - # self.learning_role.target_critics = actors_and_critics["target_critics"] + # Decentralized initialization of actors and critics for u_id, unit_strategy in self.learning_role.rl_strats.items(): unit_strategy.actor = actors_and_critics["actors"][u_id] # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] + unit_strategy.critic = actors_and_critics["critics"][u_id] + # unit_strategy.critic_target = actors_and_critics["critic_targets"][u_id] + # Assign shared dimensions self.obs_dim = actors_and_critics["obs_dim"] self.act_dim = actors_and_critics["act_dim"] self.unique_obs_dim = actors_and_critics["unique_obs_dim"] + # Centralized + # def initialize_policy(self, actors_and_critics: dict = None) -> None: + # """ + # Create actor and critic networks for reinforcement learning. + + # If `actors_and_critics` is None, this method creates new actor and critic networks. + # If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. + + # Args: + # actors_and_critics (dict): The actor and critic networks to be assigned. + + # """ + # if actors_and_critics is None: + # self.create_actors() + # self.create_critics() + + # else: + # self.learning_role.critics = actors_and_critics["critics"] + # # self.learning_role.target_critics = actors_and_critics["target_critics"] + # for u_id, unit_strategy in self.learning_role.rl_strats.items(): + # unit_strategy.actor = actors_and_critics["actors"][u_id] + # # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] + + # self.obs_dim = actors_and_critics["obs_dim"] + # self.act_dim = actors_and_critics["act_dim"] + # self.unique_obs_dim = actors_and_critics["unique_obs_dim"] + # Removed actor_target in comparison to MATD3 def create_actors(self) -> None: """ @@ -281,6 +368,7 @@ def create_actors(self) -> None: # Removed target_critics in comparison to MATD3 # Changed initialization of CriticPPO compared to MATD3 + # Decentralized def create_critics(self) -> None: """ Create decentralized critic networks for reinforcement learning. @@ -292,27 +380,22 @@ def create_critics(self) -> None: Each agent has its own critic, so the critic is no longer shared among all agents. 
""" - strategy: LearningStrategy unique_obs_dim_list = [] - for u_id, strategy in self.learning_role.rl_strats.items(): - self.learning_role.critics[u_id] = CriticPPO( - obs_dim=strategy.obs_dim, + for _, unit_strategy in self.learning_role.rl_strats.items(): + unit_strategy.critic = CriticPPO( + obs_dim=unit_strategy.obs_dim, float_type=self.float_type, - ) - - self.learning_role.critics[u_id].optimizer = Adam( - self.learning_role.critics[u_id].parameters(), lr=self.learning_rate - ) + ).to(self.device) - self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( - self.device + unit_strategy.critic.optimizer = Adam( + unit_strategy.critic.parameters(), lr=self.learning_rate ) - unique_obs_dim_list.append(strategy.unique_obs_dim) + unique_obs_dim_list.append(unit_strategy.unique_obs_dim) - # check if all unique_obs_dim are the same and raise an error if not - # if they are all the same, set the unique_obs_dim attribute + # Check if all unique_obs_dim are the same and raise an error if not + # If they are all the same, set the unique_obs_dim attribute if len(set(unique_obs_dim_list)) > 1: raise ValueError( "All unique_obs_dim values must be the same for all RL agents" @@ -320,25 +403,70 @@ def create_critics(self) -> None: else: self.unique_obs_dim = unique_obs_dim_list[0] + + + # Centralized + # def create_critics(self) -> None: + # """ + # Create decentralized critic networks for reinforcement learning. + + # This method initializes a separate critic network for each agent in the reinforcement learning setup. + # Each critic learns to predict the value function based on the individual agent's observation. + + # Notes: + # Each agent has its own critic, so the critic is no longer shared among all agents. + # """ + + # strategy: LearningStrategy + # unique_obs_dim_list = [] + + # for u_id, strategy in self.learning_role.rl_strats.items(): + # self.learning_role.critics[u_id] = CriticPPO( + # obs_dim=strategy.obs_dim, + # float_type=self.float_type, + # ) + + # self.learning_role.critics[u_id].optimizer = Adam( + # self.learning_role.critics[u_id].parameters(), lr=self.learning_rate + # ) + + # self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( + # self.device + # ) + + # unique_obs_dim_list.append(strategy.unique_obs_dim) + + # # check if all unique_obs_dim are the same and raise an error if not + # # if they are all the same, set the unique_obs_dim attribute + # if len(set(unique_obs_dim_list)) > 1: + # raise ValueError( + # "All unique_obs_dim values must be the same for all RL agents" + # ) + # else: + # self.unique_obs_dim = unique_obs_dim_list[0] + + # Decentralized def extract_policy(self) -> dict: """ Extract actor and critic networks. This method extracts the actor and critic networks associated with each learning strategy and organizes them into a - dictionary structure. The extracted networks include actors, actor_targets, critics, and target_critics. The resulting + dictionary structure. The extracted networks include actors and critics. The resulting dictionary is typically used for saving and sharing these networks. Returns: dict: The extracted actor and critic networks. 
""" actors = {} + critics = {} for u_id, unit_strategy in self.learning_role.rl_strats.items(): actors[u_id] = unit_strategy.actor + critics[u_id] = unit_strategy.critic actors_and_critics = { "actors": actors, - "critics": self.learning_role.critics, + "critics": critics, "obs_dim": self.obs_dim, "act_dim": self.act_dim, "unique_obs_dim": self.unique_obs_dim, @@ -346,6 +474,33 @@ def extract_policy(self) -> dict: return actors_and_critics + # Centralized + # def extract_policy(self) -> dict: + # """ + # Extract actor and critic networks. + + # This method extracts the actor and critic networks associated with each learning strategy and organizes them into a + # dictionary structure. The extracted networks include actors, and critics. The resulting + # dictionary is typically used for saving and sharing these networks. + + # Returns: + # dict: The extracted actor and critic networks. + # """ + # actors = {} + + # for u_id, unit_strategy in self.learning_role.rl_strats.items(): + # actors[u_id] = unit_strategy.actor + + # actors_and_critics = { + # "actors": actors, + # "critics": self.learning_role.critics, + # "obs_dim": self.obs_dim, + # "act_dim": self.act_dim, + # "unique_obs_dim": self.unique_obs_dim, + # } + + # return actors_and_critics + def update_policy(self): """ Perform policy updates using PPO with the clipped objective. @@ -361,7 +516,12 @@ def update_policy(self): # Iterate through over each agent's strategy # Each agent has its own actor and critic. Critic (value network) is in comparison to MATD3 decentralized, meaning each agent learns its own value function. for u_id in self.learning_role.rl_strats.keys(): - critic = self.learning_role.critics[u_id] + + + # Centralized + # critic = self.learning_role.critics[u_id] + # Decentralized + critic = self.learning_role.rl_strats[u_id].critic actor = self.learning_role.rl_strats[u_id].actor # Retrieve experiences from the buffer @@ -378,6 +538,8 @@ def update_policy(self): # Pass the current states through the critic network to get value estimates. values = critic(states).squeeze(dim=2) + logger.debug(f"Values: {values}") + # Store the calculated values in the rollout buffer # These values are used later to calculate the advantage estimates (for policy updates). 
self.learning_role.buffer.values = values.detach().cpu().numpy() @@ -389,6 +551,9 @@ def update_policy(self): # Iterate through the collected experiences in reverse order to calculate advantages and returns for t in reversed(range(len(rewards))): + + logger.debug(f"Reward: {t}") + if t == len(rewards) - 1: next_value = 0 else: @@ -399,10 +564,15 @@ def update_policy(self): rewards[t] + self.gamma * next_value - values[t] ) # Use self.gamma for discount factor + logger.debug(f"Delta: {delta}") + # GAE advantage last_advantage = ( delta + self.gamma * self.gae_lambda * last_advantage ) # Use self.gae_lambda for advantage estimation + + logger.debug(f"Last_advantage: {last_advantage}") + advantages.insert(0, last_advantage) returns.insert(0, last_advantage + values[t]) @@ -422,6 +592,8 @@ def update_policy(self): # Compute the ratio of new policy to old policy ratio = (new_log_probs - log_probs).exp() + logger.debug(f"Ratio: {ratio}") + # Surrogate loss calculation surrogate1 = ratio * advantages surrogate2 = ( @@ -429,12 +601,19 @@ def update_policy(self): * advantages ) # Use self.clip_ratio + logger.debug(f"surrogate1: {surrogate1}") + logger.debug(f"surrogate2: {surrogate2}") + # Final policy loss (clipped surrogate loss) policy_loss = -th.min(surrogate1, surrogate2).mean() + logger.debug(f"policy_loss: {policy_loss}") + # Value loss (mean squared error between the predicted values and returns) value_loss = F.mse_loss(returns, values.squeeze()) + logger.debug(f"value loss: {value_loss}") + # Total loss: policy loss + value loss - entropy bonus total_loss = ( policy_loss @@ -442,6 +621,8 @@ def update_policy(self): - self.entropy_coef * entropy.mean() ) # Use self.vf_coef and self.entropy_coef + logger.debug(f"total loss: {total_loss}") + # Zero the gradients and perform backpropagation for both actor and critic actor.optimizer.zero_grad() critic.optimizer.zero_grad() @@ -472,6 +653,7 @@ def get_actions(rl_strategy, next_observation): torch.Tensor: The sampled actions. torch.Tensor: The log probability of the sampled actions. 
""" + logger.debug("ppo.py: Get_actions method") actor = rl_strategy.actor device = rl_strategy.device @@ -479,19 +661,30 @@ def get_actions(rl_strategy, next_observation): # Pass observation through the actor network to get action logits (mean of action distribution) action_logits = actor(next_observation.to(device)) + logger.debug(f"Action logits: {action_logits}") + # Create a normal distribution for continuous actions (with assumed standard deviation of 1.0) action_distribution = th.distributions.Normal(action_logits, 1.0) + logger.debug(f"Action distribution: {action_distribution}") + # Sample an action from the distribution sampled_action = action_distribution.sample() + logger.debug(f"Sampled action: {sampled_action}") + # Get the log probability of the sampled action (for later PPO loss calculation) log_prob_action = action_distribution.log_prob(sampled_action).sum(dim=-1) - # Detach the log probability tensor to stop gradient tracking (since you only need the value for later) + # Detach the log probability tensor to stop gradient tracking (since we only need the value for later) log_prob_action = log_prob_action.detach() - # Bound actions to the valid action space range - sampled_action = sampled_action.clamp(-1, 1) + logger.debug(f"Detached log probability of the sampled action: {log_prob_action}") + + # PREVIOUSLY SET TO (-1, 1) + # Bound actions to [0, 1] range + sampled_action = sampled_action.clamp(0, 1) + + logger.debug(f"Clamped sampled action: {sampled_action}") return sampled_action, log_prob_action diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index 12b2b1a57..0566c28a5 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -167,9 +167,9 @@ def sample(self, batch_size: int) -> ReplayBufferSamples: batch_inds = np.random.randint(0, upper_bound - 1, size=batch_size) data = ( - self.observations[batch_inds, :, :], + self.observations[batch_inds, :, :], # current observation self.actions[batch_inds, :, :], - self.observations[batch_inds + 1, :, :], + self.observations[batch_inds + 1, :, :], # next observation self.rewards[batch_inds], ) @@ -231,11 +231,11 @@ def __init__( self.rewards = None # Stores the rewards received after each action self.log_probs = None # Stores the log-probabilities of the actions, used to compute the ratio for policy update - self.values = ( - None # Stores the value estimates (critic's predictions) of each state - ) - self.advantages = None # Stores the computed advantages using GAE (Generalized Advantage Estimation), central to PPO's policy updates - self.returns = None # Stores the discounted rewards (also known as returns), used to compute the value loss for training the critic + # self.values = ( + # None # Stores the value estimates (critic's predictions) of each state + # ) + # self.advantages = None # Stores the computed advantages using GAE (Generalized Advantage Estimation), central to PPO's policy updates + # self.returns = None # Stores the discounted rewards (also known as returns), used to compute the value loss for training the critic self.pos = 0 self.full = False @@ -254,9 +254,9 @@ def initialize_buffer(self, size): ) self.rewards = np.zeros((size, self.n_rl_units), dtype=self.np_float_type) self.log_probs = np.zeros((size, self.n_rl_units), dtype=np.float32) - self.values = np.zeros((size, self.n_rl_units), dtype=np.float32) - self.advantages = np.zeros((size, self.n_rl_units), dtype=np.float32) - self.returns = np.zeros((size, 
self.n_rl_units), dtype=np.float32) + # self.values = np.zeros((size, self.n_rl_units), dtype=np.float32) + # self.advantages = np.zeros((size, self.n_rl_units), dtype=np.float32) + # self.returns = np.zeros((size, self.n_rl_units), dtype=np.float32) def expand_buffer(self, additional_size): """Expands the buffer by the given additional size and checks if there is enough memory available.""" @@ -277,15 +277,15 @@ def expand_buffer(self, additional_size): + np.zeros( (additional_size, self.n_rl_units), dtype=np.float32 ).nbytes # log_probs - + np.zeros( - (additional_size, self.n_rl_units), dtype=np.float32 - ).nbytes # values - + np.zeros( - (additional_size, self.n_rl_units), dtype=np.float32 - ).nbytes # advantages - + np.zeros( - (additional_size, self.n_rl_units), dtype=np.float32 - ).nbytes # returns + # + np.zeros( + # (additional_size, self.n_rl_units), dtype=np.float32 + # ).nbytes # values + # + np.zeros( + # (additional_size, self.n_rl_units), dtype=np.float32 + # ).nbytes # advantages + # + np.zeros( + # (additional_size, self.n_rl_units), dtype=np.float32 + # ).nbytes # returns ) # Check whether enough memory is available @@ -341,27 +341,27 @@ def expand_buffer(self, additional_size): ), axis=0, ) - self.values = np.concatenate( - ( - self.values, - np.zeros((additional_size, self.n_rl_units), dtype=np.float32), - ), - axis=0, - ) - self.advantages = np.concatenate( - ( - self.advantages, - np.zeros((additional_size, self.n_rl_units), dtype=np.float32), - ), - axis=0, - ) - self.returns = np.concatenate( - ( - self.returns, - np.zeros((additional_size, self.n_rl_units), dtype=np.float32), - ), - axis=0, - ) + # self.values = np.concatenate( + # ( + # self.values, + # np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + # ), + # axis=0, + # ) + # self.advantages = np.concatenate( + # ( + # self.advantages, + # np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + # ), + # axis=0, + # ) + # self.returns = np.concatenate( + # ( + # self.returns, + # np.zeros((additional_size, self.n_rl_units), dtype=np.float32), + # ), + # axis=0, + # ) def add( self, @@ -398,12 +398,6 @@ def add( self.pos += len_obs - # print("buffer.add() in buffer.py") - # print(self.observations) - # print(self.actions) - # print(self.rewards) - # print(self.log_probs) - def reset(self): """ Resets the buffer, clearing all stored data. 
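As a usage sketch of this rollout buffer (not part of the patch; the sizes and shapes are assumed, the constructor keywords mirror the RolloutBuffer call in run_learning, and buffer_size is omitted here because the buffer grows dynamically):

import numpy as np
import torch as th

from assume.reinforcement_learning.buffer import RolloutBuffer

# assumed sizes, for illustration only
n_steps, n_rl_units, obs_dim, act_dim = 24, 2, 50, 1

buffer = RolloutBuffer(
    obs_dim=obs_dim,
    act_dim=act_dim,
    n_rl_units=n_rl_units,
    device=th.device("cpu"),
    float_type=th.float,
)

# one train_freq window of experience, as collected by the learning unit operator
obs = np.zeros((n_steps, n_rl_units, obs_dim), dtype=np.float32)
actions = np.zeros((n_steps, n_rl_units, act_dim), dtype=np.float32)
rewards = np.zeros((n_steps, n_rl_units), dtype=np.float32)
log_probs = np.zeros((n_steps, n_rl_units), dtype=np.float32)

buffer.add(obs=obs, actions=actions, reward=rewards, log_probs=log_probs)
# ... update_policy() then consumes the whole rollout ...
buffer.reset()  # cleared after every PPO update, unlike the MATD3 replay buffer

Unlike the replay buffer used for MATD3, nothing is sampled uniformly at random here; the on-policy rollout is used once for the update and then discarded.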
@@ -414,8 +408,11 @@ def reset(self): self.actions = None self.rewards = None self.log_probs = None - self.pos = 0 - self.full = False + # self.values = None + # self.advantages = None + # self.returns = None + # self.pos = 0 + # self.full = False # def compute_returns_and_advantages(self, last_values, dones): # """ diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 2ad8f6a2f..48e9d8210 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -42,8 +42,10 @@ def __init__( self.rl_algorithm_name, {} ).get("early_stopping_threshold", 0.05) self.episodes_done = 0 + # dict[key, value] self.rl_strats: dict[int, LearningStrategy] = {} + # For centralized critic in MATD3 self.critics = {} # define whether we train model or evaluate it @@ -206,7 +208,9 @@ def save_buffer_and_update(self, content: dict, meta: dict) -> None: self.update_policy() elif self.rl_algorithm_name == "ppo": - # print("save_buffer_and_update in learning_role.py") + + logger.debug("save_buffer_and_update in learning_role.py") + if content.get("type") == "save_buffer_and_update": data = content["data"] self.buffer.add( @@ -214,8 +218,6 @@ def save_buffer_and_update(self, content: dict, meta: dict) -> None: actions=data[1], reward=data[2], log_probs=data[3], - # values=data[4], - # dones=data[5], ) self.update_policy() diff --git a/assume/reinforcement_learning/learning_unit_operator.py b/assume/reinforcement_learning/learning_unit_operator.py index 7a546c9c6..a6debdf88 100644 --- a/assume/reinforcement_learning/learning_unit_operator.py +++ b/assume/reinforcement_learning/learning_unit_operator.py @@ -246,7 +246,7 @@ async def write_to_learning_role( # For PPO # Check whether the list of tensors is not empty and whether the tensors contain elements - if unit.outputs["rl_log_probs"] and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): + if unit.outputs["rl_log_probs"]: # and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): log_prob_tensor = th.stack( unit.outputs["rl_log_probs"][:values_len], dim=0 @@ -275,20 +275,8 @@ async def write_to_learning_role( all_rewards = np.array(all_rewards).reshape(-1, learning_unit_count) # For PPO - if unit.outputs["rl_log_probs"] and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): + if unit.outputs["rl_log_probs"]: # and all(t.numel() > 0 for t in unit.outputs["rl_log_probs"][:values_len]): all_log_probs = all_log_probs.detach().cpu().numpy().reshape(-1, learning_unit_count, 1) - - # print("ALL_OBSERVATIONS") - # print(all_observations) - - # print("ALL_ACTIONS") - # print(all_actions) - - # print("ALL_REWARDS") - # print(all_rewards) - - # print("ALL_LOG_PROBS") - # print(all_log_probs) rl_agent_data = (all_observations, all_actions, all_rewards, all_log_probs) # For MATD3 diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index 26718a1dc..eac22dcfe 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -147,9 +147,14 @@ def __init__(self, obs_dim: int, act_dim: int, float_type, *args, **kwargs): def forward(self, obs): x = F.relu(self.FC1(obs)) x = F.relu(self.FC2(x)) - x = F.softsign(self.FC3(x)) + # Works with MATD3, output of softsign: [-1, 1] + # x = F.softsign(self.FC3(x)) + # x = th.tanh(self.FC3(x)) + # Tested for PPO, scales 
the output to [0, 1] range + x = th.sigmoid(self.FC3(x)) + return x diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 50508caf1..8886b9b85 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -922,7 +922,7 @@ def run_learning( else RolloutBuffer ) buffer = buffer_cls( - buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), + buffer_size=int(world.learning_config.get("buffer_size", 5e5)), obs_dim=world.learning_role.rl_algorithm.obs_dim, act_dim=world.learning_role.rl_algorithm.act_dim, n_rl_units=len(world.learning_role.rl_strats), @@ -966,6 +966,9 @@ def run_learning( range(1, world.learning_role.training_episodes + 1), desc="Training Episodes", ): + + # print("loader_csv: Episode: ", episode) + if episode != 1: setup_world( world=world, @@ -975,6 +978,7 @@ def run_learning( ) world.learning_role.load_inter_episodic_data(inter_episodic_data) + world.run() # triggers calculate_bids() which equals to step inter_episodic_data = world.learning_role.get_inter_episodic_data() @@ -986,15 +990,18 @@ def run_learning( >= world.learning_role.episodes_collecting_initial_experience + validation_interval if world.learning_role.rl_algorithm_name == "matd3" - else episode > validation_interval # For PPO + else episode >= validation_interval # For PPO ): + + logger.debug(f"Validation of loader_csv after episode {episode}") + world.reset() setup_world( world=world, scenario_data=scenario_data, study_case=study_case, - perform_evaluation=True, + perform_evaluation=True, # perform evaluation triggers save_buffer_and_update, which triggers update_policy() eval_episode=eval_episode, ) @@ -1064,7 +1071,7 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) - print("Evaluation finished") + logger.debug("Evaluation finished") if __name__ == "__main__": diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index b5837d9e7..559ee3cae 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -105,11 +105,15 @@ def __init__(self, *args, **kwargs): elif Path(kwargs["trained_policies_save_path"]).is_dir(): self.load_actor_params(load_path=kwargs["trained_policies_save_path"]) + # Ensure action_noise is defined even when not in learning or evaluation mode + self.action_noise = None + self.collect_initial_experience_mode = None else: raise FileNotFoundError( f"No policies were provided for DRL unit {self.unit_id}!. Please provide a valid path to the trained policies." 
) + def calculate_bids( self, unit: SupportsMinMax, @@ -204,11 +208,10 @@ def calculate_bids( # unit.outputs["exploration_noise"][start] = noise # TODO: Make this algo specific function # Check if extra_info is noise or log_probs and store it accordingly + if isinstance(extra_info, th.Tensor) and extra_info.shape == actions.shape: unit.outputs["exploration_noise"][start] = extra_info # It's noise else: - # print("Type of extra_info: ", extra_info) - # print(type(extra_info)) unit.outputs["rl_log_probs"].append(extra_info) # It's log_probs # unit.outputs["dones"][start] = False diff --git a/examples/examples.py b/examples/examples.py index 46f9d20da..98df6b68e 100644 --- a/examples/examples.py +++ b/examples/examples.py @@ -67,8 +67,8 @@ # DRL references case for learning advancement testing "small_learning_1": { "scenario": "example_02a", - "study_case": "tiny", - }, # Changed from base to tiny for testing + "study_case": "base", + }, "small_learning_2": {"scenario": "example_02b", "study_case": "base"}, "small_learning_3": {"scenario": "example_02c", "study_case": "base"}, # DRL cases with lstm instead of mlp as actor neural network architecture diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 95b6d7b75..7779be514 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -4,7 +4,7 @@ tiny: start_date: 2019-01-01 00:00 - end_date: 2019-01-02 00:00 # Changed from 2019-01-05 00:00 to 2019-01-02 00:00 + end_date: 2019-01-05 00:00 time_step: 1h save_frequency_hours: null learning_mode: True @@ -16,6 +16,7 @@ tiny: algorithm: ppo device: cpu learning_rate: 0.001 + buffer_size: 5e5 matd3: actor_architecture: mlp training_episodes: 10 @@ -32,9 +33,9 @@ tiny: actor_architecture: mlp training_episodes: 10 validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated - train_freq: 24h # how often write_to_learning_role gets called + train_freq: 48h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards - epochs: 4 # Number of epochs for updating the policy + epochs: 5 # #4 # Number of epochs for updating the policy clip_ratio: 0.2 # Clipping parameter for policy updates vf_coef: 0.5 # Value function coefficient in the loss function entropy_coef: 0.01 # Entropy coefficient for exploration @@ -71,39 +72,34 @@ base: continue_learning: False trained_policies_save_path: null max_bid_price: 100 - algorithm: matd3 + algorithm: ppo + device: cpu + learning_rate: 0.001 matd3: actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 50 - episodes_collecting_initial_experience: 5 - train_freq: 24h + training_episodes: 10 + episodes_collecting_initial_experience: 3 + train_freq: 48h gradient_steps: -1 - batch_size: 256 + batch_size: 64 gamma: 0.99 - device: cpu noise_sigma: 0.1 noise_scale: 1 noise_dt: 1 validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 - ppo: + ppo: actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 50 - episodes_collecting_initial_experience: 5 - train_freq: 24h - gradient_steps: -1 - batch_size: 256 - gamma: 0.99 - device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + training_episodes: 10 + validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated + train_freq: 24h # how often 
write_to_learning_role gets called + gamma: 0.99 # Discount factor for future rewards + epochs: 5 # #4 # Number of epochs for updating the policy + clip_ratio: 0.2 # Clipping parameter for policy updates + vf_coef: 0.5 # Value function coefficient in the loss function + entropy_coef: 0.01 # Entropy coefficient for exploration + max_grad_norm: 0.5 # Gradient clipping value + gae_lambda: 0.95 # GAE lambda for advantage estimation + batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) markets_config: EOM: From aba37f165ae6a9a7fe8278a175170db798cb404d Mon Sep 17 00:00:00 2001 From: ufqjh Date: Wed, 23 Oct 2024 16:33:48 +0200 Subject: [PATCH 06/23] Implemented centralized critic --- .../reinforcement_learning/algorithms/ppo.py | 365 +-- .../neural_network_architecture.py | 49 +- assume/scenario/loader_csv.py | 2 +- .../ASSUME_Actor_Comparison.json | 2279 +++++++++++++++++ 4 files changed, 2495 insertions(+), 200 deletions(-) create mode 100644 docker_configs/dashboard-definitions/ASSUME_Actor_Comparison.json diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 080d93a6b..a46b7dfab 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -72,33 +72,12 @@ def save_params(self, directory): # Removed critic_target in comparison to MATD3 # Decentralized - def save_critic_params(self, directory): - """ - Save the parameters of critic networks. - - This method saves the parameters of the critic networks, including the critic's state_dict and the critic's optimizer state_dict. - It organizes the saved parameters into a directory structure specific to the critic associated with each learning strategy. - - Args: - directory (str): The base directory for saving the parameters. - """ - os.makedirs(directory, exist_ok=True) - for u_id in self.learning_role.rl_strats.keys(): - obj = { - "critic": self.learning_role.rl_strats[u_id].critic.state_dict(), - "critic_optimizer": self.learning_role.rl_strats[u_id].critic.optimizer.state_dict(), - } - path = f"{directory}/critic_{u_id}.pt" - th.save(obj, path) - - - # Centralized # def save_critic_params(self, directory): # """ # Save the parameters of critic networks. - # This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict. It organizes the saved parameters into a directory structure specific to the critic - # associated with each learning strategy. + # This method saves the parameters of the critic networks, including the critic's state_dict and the critic's optimizer state_dict. + # It organizes the saved parameters into a directory structure specific to the critic associated with each learning strategy. # Args: # directory (str): The base directory for saving the parameters. 
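To keep the two wirings apart while reading this commit: the decentralized variant stores one critic on each learning strategy, while the centralized variant (as in the MATD3 implementation) keeps the critics in a dict on the learning role. A small schematic helper, purely illustrative and not part of the patch, that only captures where the networks live (it says nothing about what the centralized critic observes):

def get_critic(learning_role, u_id, centralized: bool):
    """Illustrative helper: where the PPO critic for unit u_id is stored."""
    if centralized:
        # this commit: critics live in a dict on the learning role, keyed by unit id
        return learning_role.critics[u_id]
    # previous commit: each learning strategy owns its own critic
    return learning_role.rl_strats[u_id].critic

Saving, loading, initialization, and the policy update all change along this one distinction, which is why the commented-out counterparts are kept side by side in this file.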
@@ -106,16 +85,37 @@ def save_critic_params(self, directory): # os.makedirs(directory, exist_ok=True) # for u_id in self.learning_role.rl_strats.keys(): # obj = { - # "critic": self.learning_role.critics[u_id].state_dict(), - # # "critic_target": self.learning_role.target_critics[u_id].state_dict(), - # "critic_optimizer": self.learning_role.critics[ - # u_id - # ].optimizer.state_dict(), + # "critic": self.learning_role.rl_strats[u_id].critic.state_dict(), + # "critic_optimizer": self.learning_role.rl_strats[u_id].critic.optimizer.state_dict(), # } # path = f"{directory}/critic_{u_id}.pt" # th.save(obj, path) - # Removed actor_target in comparison to MATD3 (Actor network = policy network) + + # Centralized + def save_critic_params(self, directory): + """ + Save the parameters of critic networks. + + This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict. It organizes the saved parameters into a directory structure specific to the critic + associated with each learning strategy. + + Args: + directory (str): The base directory for saving the parameters. + """ + os.makedirs(directory, exist_ok=True) + for u_id in self.learning_role.rl_strats.keys(): + obj = { + "critic": self.learning_role.critics[u_id].state_dict(), + # "critic_target": self.learning_role.target_critics[u_id].state_dict(), + "critic_optimizer": self.learning_role.critics[ + u_id + ].optimizer.state_dict(), + } + path = f"{directory}/critic_{u_id}.pt" + th.save(obj, path) + + # Removed actor_target in comparison to MATD3 def save_actor_params(self, directory): """ Save the parameters of actor networks. @@ -157,48 +157,13 @@ def load_params(self, directory: str) -> None: # Removed critic_target in comparison to MATD3 (critic network = value function network) # Decentralized - def load_critic_params(self, directory: str) -> None: - """ - Load the parameters of critic networks from a specified directory. - - This method loads the parameters of critic networks, including the critic's state_dict and - the critic's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated - with the learning role, loads the respective parameters, and updates the critic networks accordingly. - - Args: - directory (str): The directory from which the parameters should be loaded. - """ - logger.info("Loading critic parameters...") - - if not os.path.exists(directory): - logger.warning( - "Specified directory for loading the critics does not exist! Starting with randomly initialized values!" - ) - return - - for u_id in self.learning_role.rl_strats.keys(): - try: - critic_params = self.load_obj( - directory=f"{directory}/critics/critic_{str(u_id)}.pt" - ) - self.learning_role.rl_strats[u_id].critic.load_state_dict( - critic_params["critic"] - ) - self.learning_role.rl_strats[u_id].critic.optimizer.load_state_dict( - critic_params["critic_optimizer"] - ) - except Exception: - logger.warning(f"No critic values loaded for agent {u_id}") - - - # Centralized # def load_critic_params(self, directory: str) -> None: # """ # Load the parameters of critic networks from a specified directory. - # This method loads the parameters of critic networks, including the critic's state_dict, critic_target's state_dict, and + # This method loads the parameters of critic networks, including the critic's state_dict and # the critic's optimizer state_dict, from the specified directory. 
It iterates through the learning strategies associated - # with the learning role, loads the respective parameters, and updates the critic and target critic networks accordingly. + # with the learning role, loads the respective parameters, and updates the critic networks accordingly. # Args: # directory (str): The directory from which the parameters should be loaded. @@ -216,18 +181,53 @@ def load_critic_params(self, directory: str) -> None: # critic_params = self.load_obj( # directory=f"{directory}/critics/critic_{str(u_id)}.pt" # ) - # self.learning_role.critics[u_id].load_state_dict( + # self.learning_role.rl_strats[u_id].critic.load_state_dict( # critic_params["critic"] # ) - # # self.learning_role.target_critics[u_id].load_state_dict( - # # critic_params["critic_target"] - # # ) - # self.learning_role.critics[u_id].optimizer.load_state_dict( + # self.learning_role.rl_strats[u_id].critic.optimizer.load_state_dict( # critic_params["critic_optimizer"] # ) # except Exception: # logger.warning(f"No critic values loaded for agent {u_id}") + + # Centralized + def load_critic_params(self, directory: str) -> None: + """ + Load the parameters of critic networks from a specified directory. + + This method loads the parameters of critic networks, including the critic's state_dict, critic_target's state_dict, and + the critic's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated + with the learning role, loads the respective parameters, and updates the critic and target critic networks accordingly. + + Args: + directory (str): The directory from which the parameters should be loaded. + """ + logger.info("Loading critic parameters...") + + if not os.path.exists(directory): + logger.warning( + "Specified directory for loading the critics does not exist! Starting with randomly initialized values!" + ) + return + + for u_id in self.learning_role.rl_strats.keys(): + try: + critic_params = self.load_obj( + directory=f"{directory}/critics/critic_{str(u_id)}.pt" + ) + self.learning_role.critics[u_id].load_state_dict( + critic_params["critic"] + ) + # self.learning_role.target_critics[u_id].load_state_dict( + # critic_params["critic_target"] + # ) + self.learning_role.critics[u_id].optimizer.load_state_dict( + critic_params["critic_optimizer"] + ) + except Exception: + logger.warning(f"No critic values loaded for agent {u_id}") + # Removed actor_target in comparison to MATD3 def load_actor_params(self, directory: str) -> None: """ @@ -266,33 +266,6 @@ def load_actor_params(self, directory: str) -> None: # Removed target_critics and actor_target in comparison to MATD3 # Decentralized - def initialize_policy(self, actors_and_critics: dict = None) -> None: - """ - Create actor and critic networks for reinforcement learning. - - If `actors_and_critics` is None, this method creates new actor and critic networks. - If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. - - Args: - actors_and_critics (dict): The actor and critic networks to be assigned. 
- """ - if actors_and_critics is None: - self.create_actors() - self.create_critics() - else: - # Decentralized initialization of actors and critics - for u_id, unit_strategy in self.learning_role.rl_strats.items(): - unit_strategy.actor = actors_and_critics["actors"][u_id] - # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] - unit_strategy.critic = actors_and_critics["critics"][u_id] - # unit_strategy.critic_target = actors_and_critics["critic_targets"][u_id] - - # Assign shared dimensions - self.obs_dim = actors_and_critics["obs_dim"] - self.act_dim = actors_and_critics["act_dim"] - self.unique_obs_dim = actors_and_critics["unique_obs_dim"] - - # Centralized # def initialize_policy(self, actors_and_critics: dict = None) -> None: # """ # Create actor and critic networks for reinforcement learning. @@ -302,23 +275,50 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: # Args: # actors_and_critics (dict): The actor and critic networks to be assigned. - # """ # if actors_and_critics is None: # self.create_actors() # self.create_critics() - # else: - # self.learning_role.critics = actors_and_critics["critics"] - # # self.learning_role.target_critics = actors_and_critics["target_critics"] + # # Decentralized initialization of actors and critics # for u_id, unit_strategy in self.learning_role.rl_strats.items(): # unit_strategy.actor = actors_and_critics["actors"][u_id] # # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] + # unit_strategy.critic = actors_and_critics["critics"][u_id] + # # unit_strategy.critic_target = actors_and_critics["critic_targets"][u_id] + # # Assign shared dimensions # self.obs_dim = actors_and_critics["obs_dim"] # self.act_dim = actors_and_critics["act_dim"] # self.unique_obs_dim = actors_and_critics["unique_obs_dim"] + # Centralized + def initialize_policy(self, actors_and_critics: dict = None) -> None: + """ + Create actor and critic networks for reinforcement learning. + + If `actors_and_critics` is None, this method creates new actor and critic networks. + If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. + + Args: + actors_and_critics (dict): The actor and critic networks to be assigned. + + """ + if actors_and_critics is None: + self.create_actors() + self.create_critics() + + else: + self.learning_role.critics = actors_and_critics["critics"] + # self.learning_role.target_critics = actors_and_critics["target_critics"] + for u_id, unit_strategy in self.learning_role.rl_strats.items(): + unit_strategy.actor = actors_and_critics["actors"][u_id] + # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] + + self.obs_dim = actors_and_critics["obs_dim"] + self.act_dim = actors_and_critics["act_dim"] + self.unique_obs_dim = actors_and_critics["unique_obs_dim"] + # Removed actor_target in comparison to MATD3 def create_actors(self) -> None: """ @@ -369,43 +369,6 @@ def create_actors(self) -> None: # Removed target_critics in comparison to MATD3 # Changed initialization of CriticPPO compared to MATD3 # Decentralized - def create_critics(self) -> None: - """ - Create decentralized critic networks for reinforcement learning. - - This method initializes a separate critic network for each agent in the reinforcement learning setup. - Each critic learns to predict the value function based on the individual agent's observation. - - Notes: - Each agent has its own critic, so the critic is no longer shared among all agents. 
- """ - - unique_obs_dim_list = [] - - for _, unit_strategy in self.learning_role.rl_strats.items(): - unit_strategy.critic = CriticPPO( - obs_dim=unit_strategy.obs_dim, - float_type=self.float_type, - ).to(self.device) - - unit_strategy.critic.optimizer = Adam( - unit_strategy.critic.parameters(), lr=self.learning_rate - ) - - unique_obs_dim_list.append(unit_strategy.unique_obs_dim) - - # Check if all unique_obs_dim are the same and raise an error if not - # If they are all the same, set the unique_obs_dim attribute - if len(set(unique_obs_dim_list)) > 1: - raise ValueError( - "All unique_obs_dim values must be the same for all RL agents" - ) - else: - self.unique_obs_dim = unique_obs_dim_list[0] - - - - # Centralized # def create_critics(self) -> None: # """ # Create decentralized critic networks for reinforcement learning. @@ -417,27 +380,22 @@ def create_critics(self) -> None: # Each agent has its own critic, so the critic is no longer shared among all agents. # """ - # strategy: LearningStrategy # unique_obs_dim_list = [] - # for u_id, strategy in self.learning_role.rl_strats.items(): - # self.learning_role.critics[u_id] = CriticPPO( - # obs_dim=strategy.obs_dim, + # for _, unit_strategy in self.learning_role.rl_strats.items(): + # unit_strategy.critic = CriticPPO( + # obs_dim=unit_strategy.obs_dim, # float_type=self.float_type, - # ) + # ).to(self.device) - # self.learning_role.critics[u_id].optimizer = Adam( - # self.learning_role.critics[u_id].parameters(), lr=self.learning_rate + # unit_strategy.critic.optimizer = Adam( + # unit_strategy.critic.parameters(), lr=self.learning_rate # ) - # self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( - # self.device - # ) + # unique_obs_dim_list.append(unit_strategy.unique_obs_dim) - # unique_obs_dim_list.append(strategy.unique_obs_dim) - - # # check if all unique_obs_dim are the same and raise an error if not - # # if they are all the same, set the unique_obs_dim attribute + # # Check if all unique_obs_dim are the same and raise an error if not + # # If they are all the same, set the unique_obs_dim attribute # if len(set(unique_obs_dim_list)) > 1: # raise ValueError( # "All unique_obs_dim values must be the same for all RL agents" @@ -445,55 +403,74 @@ def create_critics(self) -> None: # else: # self.unique_obs_dim = unique_obs_dim_list[0] - # Decentralized - def extract_policy(self) -> dict: + + + # Centralized + def create_critics(self) -> None: """ - Extract actor and critic networks. + Create decentralized critic networks for reinforcement learning. - This method extracts the actor and critic networks associated with each learning strategy and organizes them into a - dictionary structure. The extracted networks include actors and critics. The resulting - dictionary is typically used for saving and sharing these networks. + This method initializes a separate critic network for each agent in the reinforcement learning setup. + Each critic learns to predict the value function based on the individual agent's observation. - Returns: - dict: The extracted actor and critic networks. + Notes: + Each agent has its own critic, so the critic is no longer shared among all agents. 
""" - actors = {} - critics = {} - for u_id, unit_strategy in self.learning_role.rl_strats.items(): - actors[u_id] = unit_strategy.actor - critics[u_id] = unit_strategy.critic + n_agents = len(self.learning_role.rl_strats) + strategy: LearningStrategy + unique_obs_dim_list = [] - actors_and_critics = { - "actors": actors, - "critics": critics, - "obs_dim": self.obs_dim, - "act_dim": self.act_dim, - "unique_obs_dim": self.unique_obs_dim, - } + for u_id, strategy in self.learning_role.rl_strats.items(): + self.learning_role.critics[u_id] = CriticPPO( + n_agents=n_agents, + obs_dim=strategy.obs_dim, + act_dim=strategy.act_dim, + unique_obs_dim=strategy.unique_obs_dim, + float_type=self.float_type, + ) - return actors_and_critics + self.learning_role.critics[u_id].optimizer = Adam( + self.learning_role.critics[u_id].parameters(), lr=self.learning_rate + ) - # Centralized + self.learning_role.critics[u_id] = self.learning_role.critics[u_id].to( + self.device + ) + + unique_obs_dim_list.append(strategy.unique_obs_dim) + + # check if all unique_obs_dim are the same and raise an error if not + # if they are all the same, set the unique_obs_dim attribute + if len(set(unique_obs_dim_list)) > 1: + raise ValueError( + "All unique_obs_dim values must be the same for all RL agents" + ) + else: + self.unique_obs_dim = unique_obs_dim_list[0] + + # Decentralized # def extract_policy(self) -> dict: # """ # Extract actor and critic networks. # This method extracts the actor and critic networks associated with each learning strategy and organizes them into a - # dictionary structure. The extracted networks include actors, and critics. The resulting + # dictionary structure. The extracted networks include actors and critics. The resulting # dictionary is typically used for saving and sharing these networks. # Returns: # dict: The extracted actor and critic networks. # """ # actors = {} + # critics = {} # for u_id, unit_strategy in self.learning_role.rl_strats.items(): # actors[u_id] = unit_strategy.actor + # critics[u_id] = unit_strategy.critic # actors_and_critics = { # "actors": actors, - # "critics": self.learning_role.critics, + # "critics": critics, # "obs_dim": self.obs_dim, # "act_dim": self.act_dim, # "unique_obs_dim": self.unique_obs_dim, @@ -501,6 +478,33 @@ def extract_policy(self) -> dict: # return actors_and_critics + # Centralized + def extract_policy(self) -> dict: + """ + Extract actor and critic networks. + + This method extracts the actor and critic networks associated with each learning strategy and organizes them into a + dictionary structure. The extracted networks include actors, and critics. The resulting + dictionary is typically used for saving and sharing these networks. + + Returns: + dict: The extracted actor and critic networks. + """ + actors = {} + + for u_id, unit_strategy in self.learning_role.rl_strats.items(): + actors[u_id] = unit_strategy.actor + + actors_and_critics = { + "actors": actors, + "critics": self.learning_role.critics, + "obs_dim": self.obs_dim, + "act_dim": self.act_dim, + "unique_obs_dim": self.unique_obs_dim, + } + + return actors_and_critics + def update_policy(self): """ Perform policy updates using PPO with the clipped objective. @@ -517,11 +521,10 @@ def update_policy(self): # Each agent has its own actor and critic. Critic (value network) is in comparison to MATD3 decentralized, meaning each agent learns its own value function. 
for u_id in self.learning_role.rl_strats.keys(): - # Centralized - # critic = self.learning_role.critics[u_id] + critic = self.learning_role.critics[u_id] # Decentralized - critic = self.learning_role.rl_strats[u_id].critic + #critic = self.learning_role.rl_strats[u_id].critic actor = self.learning_role.rl_strats[u_id].actor # Retrieve experiences from the buffer @@ -536,7 +539,7 @@ def update_policy(self): # Potentially, it could be useful to source some functionality out into methods stored in buffer.py # Pass the current states through the critic network to get value estimates. - values = critic(states).squeeze(dim=2) + values = critic(states, actions).squeeze(dim=2) logger.debug(f"Values: {values}") diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index eac22dcfe..8b1a1f0ad 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -94,33 +94,46 @@ def q1_forward(self, obs, actions): class CriticPPO(nn.Module): """Critic Network for Proximal Policy Optimization (PPO). - Each agent has its own critic, so this class defines the architecture for an individual agent's critic. + Centralized critic. Args: - obs_dim (int): Dimension of the observation space. - float_type: Data type for the model parameters. + n_agents (int): Number of agents + obs_dim (int): Dimension of each state + act_dim (int): Dimension of each action """ - def __init__(self, obs_dim: int, float_type): + def __init__(self, n_agents: int, obs_dim: int, act_dim: int, float_type, unique_obs_dim: int = 0): super().__init__() - # Define the architecture of the Critic network for an individual agent - self.fc1 = nn.Linear(obs_dim, 256, dtype=float_type) - self.fc2 = nn.Linear(256, 128, dtype=float_type) - self.fc3 = nn.Linear(128, 1, dtype=float_type) + self.obs_dim = obs_dim + unique_obs_dim * (n_agents - 1) + self.act_dim = act_dim * n_agents - def forward(self, obs): - """Forward pass through the critic network. + if n_agents <= 50: + self.FC_1 = nn.Linear(self.obs_dim + self.act_dim, 512, dtype=float_type) + self.FC_2 = nn.Linear(512, 256, dtype=float_type) + self.FC_3 = nn.Linear(256, 128, dtype=float_type) + self.FC_4 = nn.Linear(128, 1, dtype=float_type) + else: + self.FC_1 = nn.Linear(self.obs_dim + self.act_dim, 1024, dtype=float_type) + self.FC_2 = nn.Linear(1024, 512, dtype=float_type) + self.FC_3 = nn.Linear(512, 128, dtype=float_type) + self.FC_4 = nn.Linear(128, 1, dtype=float_type) + def forward(self, obs, actions): + """ Args: - obs (torch.Tensor): The observation input for the agent. + obs (torch.Tensor): The observations + actions (torch.Tensor): The actions - Returns: - torch.Tensor: The value output from the critic network. 
""" - x = F.relu(self.fc1(obs)) - x = F.relu(self.fc2(x)) - value = self.fc3(x) + + xu = th.cat([obs, actions], dim=-1) + + x = F.relu(self.FC_1(xu)) + x = F.relu(self.FC_2(x)) + x = F.relu(self.FC_3(x)) + value = self.FC_4(x) + return value class Actor(nn.Module): @@ -148,12 +161,12 @@ def forward(self, obs): x = F.relu(self.FC1(obs)) x = F.relu(self.FC2(x)) # Works with MATD3, output of softsign: [-1, 1] - # x = F.softsign(self.FC3(x)) + x = F.softsign(self.FC3(x)) # x = th.tanh(self.FC3(x)) # Tested for PPO, scales the output to [0, 1] range - x = th.sigmoid(self.FC3(x)) + #x = th.sigmoid(self.FC3(x)) return x diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 8886b9b85..67323582a 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -922,7 +922,7 @@ def run_learning( else RolloutBuffer ) buffer = buffer_cls( - buffer_size=int(world.learning_config.get("buffer_size", 5e5)), + buffer_size=int(float(world.learning_config.get("buffer_size", 5e5))), obs_dim=world.learning_role.rl_algorithm.obs_dim, act_dim=world.learning_role.rl_algorithm.act_dim, n_rl_units=len(world.learning_role.rl_strats), diff --git a/docker_configs/dashboard-definitions/ASSUME_Actor_Comparison.json b/docker_configs/dashboard-definitions/ASSUME_Actor_Comparison.json new file mode 100644 index 000000000..84a151fe2 --- /dev/null +++ b/docker_configs/dashboard-definitions/ASSUME_Actor_Comparison.json @@ -0,0 +1,2279 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This dashboard offers various perspectives (performance, run time and robustness) to compare different Actor architectures.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "description": "", + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# Welcome to ASSUMES Actor Comparison Dashboard\n", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": false, + "rawSql": "SELECT\n datetime AS \"time\",\n power\nFROM market_dispatch\nWHERE\n $__timeFilter(datetime)\nORDER BY 1", + "refId": "A", + "select": [ + [ + { + "params": [ + "power" + ], + "type": "column" + } + ] + ], + "table": "market_dispatch", + "timeColumn": "datetime", + "timeColumnType": "timestamp", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "type": "text" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 16, + "panels": [], + "title": "Market Result Comparison", + "type": "row" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisGridShow": false, + "axisLabel": "Price in [€/MWh]", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0.5, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "max": 90, + "min": 33, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#a3222240", + "value": 55.7 + }, + { + "color": "#a322226e", + "value": 85.7 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Max. Price" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#8f8f8f52", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Avg. Price" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#404040", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Min. Price" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5f6a7a82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "Max. Price" + }, + "properties": [ + { + "id": "custom.fillBelowTo", + "value": "Min. Price" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Q1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#00968252", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "Q3" + }, + "properties": [ + { + "id": "custom.fillBelowTo", + "value": "Q1" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Q3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0096824f", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\r\n product_start AS \"time\",\r\n avg(price) AS \"Avg. Price\",\r\n min(price) AS \"Min. Price\", \r\n percentile_disc (0.25) WITHIN GROUP ( ORDER BY price ) AS \"Q1\", \r\n percentile_disc (0.75) WITHIN GROUP ( ORDER BY price ) AS \"Q3\", \r\n max(price) AS \"Max. 
Price\"\r\nFROM market_meta\r\nWHERE (SUBSTRING(simulation, 1, regexp_instr(simulation, 'run')-2) IN ($case_study)) AND $__timeFilter(product_start) \r\nGROUP BY product_start\r\nORDER BY product_start", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Market Clearing Price - Case 2 with LSTM", + "type": "timeseries" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "Accepted price and (flexible load) bid prices per unit in the chosen market", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic-by-name" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 85.7 + } + ] + }, + "unit": "currencyEUR" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "price .*" + }, + "properties": [ + { + "id": "unit", + "value": "€/MW" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Accepted price: pp_1 - EOM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#a22223", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Bid price - flex:" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#009682", + "mode": "shades" + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.15", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "time_series", + "group": [ + { + "params": [ + "$__interval", + "none" + ], + "type": "time" + }, + { + "params": [ + "unit_id" + ], + "type": "column" + } + ], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "SELECT\r\n $__timeGroupAlias(start_time,$__interval),\r\n avg(accepted_price::float) AS \"Accepted price:\",\r\n max(price) AS \"Bid price - flex:\",\r\n concat(unit_id, ' - ', market_id) as \"unit_id\"\r\nFROM market_orders\r\nWHERE\r\n $__timeFilter(start_time) AND\r\n unit_id <> 'demand_EOM' AND \r\n simulation = '$simulation'\r\nGROUP BY 1, unit_id, market_id\r\nORDER BY 1", + "refId": "A", + "select": [ + [ + { + "params": [ + "original_price" + ], + "type": "column" + }, + { + "params": [ + "avg" + ], + "type": "aggregate" + }, + { + "params": [ + "price" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "unit_id" + ], + "type": "column" + }, + { + "params": [ + "unit_id" + ], + 
"type": "alias" + } + ] + ], + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + }, + "table": "market_orders", + "timeColumn": "start_time", + "timeColumnType": "timestamp", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + }, + { + "datatype": "text", + "name": "", + "params": [ + "market_id", + "=", + "'$market'" + ], + "type": "expression" + }, + { + "datatype": "text", + "name": "", + "params": [ + "simulation", + "=", + "'$simulation'" + ], + "type": "expression" + } + ] + } + ], + "title": "Bid Prices: $simulation", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "byVariable": false, + "include": { + "names": [ + "Time", + "Bid price - flex: pp_1 - EOM", + "Bid price - flex: pp_10 - EOM", + "Bid price - flex: pp_11 - EOM", + "Bid price - flex: pp_2 - EOM", + "Bid price - flex: pp_3 - EOM", + "Bid price - flex: pp_4 - EOM", + "Bid price - flex: pp_5 - EOM", + "Bid price - flex: pp_6 - EOM", + "Bid price - flex: pp_7 - EOM", + "Bid price - flex: pp_8 - EOM", + "Bid price - flex: pp_9 - EOM", + "Accepted price: pp_1 - EOM" + ] + } + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "Averaged over all simulation runs.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "decimals": 2, + "mappings": [ + { + "options": { + "example_02a_harder": { + "color": "dark-blue", + "index": 0, + "text": "Case 1" + }, + "example_02a_harder_LF": { + "color": "semi-dark-blue", + "index": 1, + "text": "Case 1 (LF)" + }, + "example_02a_harder_lstm": { + "color": "blue", + "index": 2, + "text": "Case 1 with LSTM" + }, + "example_02a_harder_lstm_LF": { + "color": "light-blue", + "index": 3, + "text": "Case 1 with LSTM (LF)" + }, + "example_02b_harder": { + "color": "dark-orange", + "index": 4, + "text": "Case 2" + }, + "example_02b_harder_lstm": { + "color": "super-light-orange", + "index": 5, + "text": "Case 2 with LSTM" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Case Study" + }, + "properties": [ + { + "id": "custom.width", + "value": 163 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Overall Average" + }, + "properties": [ + { + "id": "custom.width", + "value": 172 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Std. Dev" + }, + "properties": [ + { + "id": "custom.width", + "value": 82 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Std. Dev." 
+ }, + "properties": [ + { + "id": "custom.width", + "value": 138 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 16 + }, + "id": 24, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": [], + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\r\n simulation as \"Case Study\", \r\n avg(\"Avg. Price\") as \"Overall Average\", \r\n stddev(\"Avg. Price\") as \"Std. Dev.\"\r\nFROM (\r\n SELECT\r\n SUBSTRING(simulation, 1, regexp_instr(simulation, 'run')-2) as \"simulation\", \r\n substr(simulation, regexp_instr(simulation, 'run')) as \"run\",\r\n avg(price) AS \"Avg. Price\"\r\n FROM market_meta\r\n WHERE (\"market_id\" LIKE 'EOM')\r\n GROUP BY simulation\r\n ORDER BY 1, LENGTH(simulation), 2\r\n) as subselect\r\nGROUP BY simulation", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Overall Avg. Market Clearing Price", + "type": "table" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic-by-name" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "example_02a_harder": { + "color": "dark-blue", + "index": 0, + "text": "Case 1" + }, + "example_02a_harder_LF": { + "color": "semi-dark-blue", + "index": 1, + "text": "Case 1 (LF)" + }, + "example_02a_harder_lstm": { + "color": "blue", + "index": 2, + "text": "Case 1 with LSTM" + }, + "example_02a_harder_lstm_LF": { + "color": "light-blue", + "index": 3, + "text": "Case 1 with LSTM (LF)" + }, + "example_02b_harder": { + "color": "dark-orange", + "index": 4, + "text": "Case 2" + }, + "example_02b_harder_lstm": { + "color": "super-light-orange", + "index": 5, + "text": "Case 2 with LSTM" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Case 2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 15, + "x": 9, + "y": 16 + }, + "id": 23, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "colorByField": "simulation", + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "multi", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": 
"code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n SUBSTRING(simulation, 1, regexp_instr(simulation, 'run')-2) as \"simulation\", \r\n substr(simulation, regexp_instr(simulation, 'run')) as \"run\",\r\n avg(price) AS \"Avg. Price\"\r\nFROM market_meta\r\nWHERE (\"market_id\" LIKE 'EOM')\r\nGROUP BY simulation\r\nORDER BY 1, LENGTH(simulation), 2;", + "refId": "Market Clearing Price", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Avg. Market Clearing Price per Run", + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 18, + "panels": [], + "title": "Computation Time Comparison", + "type": "row" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "Comparison of Total Run Times for each Simulation Run", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic-by-name" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [ + { + "options": { + "1": { + "color": "dark-blue", + "index": 0, + "text": "Case 1" + }, + "2": { + "color": "semi-dark-blue", + "index": 1, + "text": "Case 1 (LF)" + }, + "3": { + "color": "blue", + "index": 2, + "text": "Case 1 with LSTM" + }, + "4": { + "color": "light-blue", + "index": 3, + "text": "Case 1 with LSTM (LF)" + }, + "5": { + "color": "semi-dark-purple", + "index": 4, + "text": "Case 2" + }, + "6": { + "color": "light-purple", + "index": 5, + "text": "Case 2 with LSTM" + } + }, + "type": "value" + } + ], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "min" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-green", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "avg" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-yellow", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "max" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-red", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "bars" + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "max" + }, + "properties": [ + { + "id": "custom.fillBelowTo", + "value": "min" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "index" + }, + "properties": [ + { + "id": 
"custom.axisGridShow", + "value": false + }, + { + "id": "custom.axisPlacement", + "value": "auto" + }, + { + "id": "max", + "value": 6.5 + }, + { + "id": "unit", + "value": "time: " + }, + { + "id": "min", + "value": 0.5 + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 11, + "x": 0, + "y": 25 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + }, + "xField": "index" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": true, + "rawQuery": true, + "rawSql": "SELECT \r\n SUBSTRING(ident, 1, 18) as scenario,\r\n SUBSTRING(ident, 20) as run,\r\n value \r\nFROM \r\n kpis \r\nWHERE \r\n variable = 'total_run_time'\r\nORDER BY \r\n scenario, LENGTH(ident), run", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n scenario, \r\n ROW_NUMBER() OVER (ORDER BY scenario) as index, \r\n min(value),\r\n avg(value), \r\n max(value)\r\nFROM (\r\n SELECT \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) as scenario,\r\n SUBSTRING(ident, regexp_instr(ident, 'run')) as run,\r\n value \r\n FROM \r\n kpis \r\n WHERE \r\n variable = 'total_run_time'\r\n ORDER BY \r\n scenario, LENGTH(ident), run\r\n ) as subselect\r\nGROUP BY \r\n scenario", + "refId": "B", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Total Run Time", + "transformations": [ + { + "disabled": true, + "id": "calculateField", + "options": { + "alias": "index_2", + "index": { + "asPercentile": false + }, + "mode": "index", + "reduce": { + "reducer": "sum" + }, + "replaceFields": false + } + }, + { + "disabled": true, + "id": "calculateField", + "options": { + "alias": "index", + "binary": { + "left": "index_2", + "right": "1" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } + } + ], + "type": "trend" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "How long takes one simulation episode?", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "palette-classic-by-name" + }, + "custom": { + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1 + }, + "fieldMinMax": false, + "mappings": [ + { + "options": { + "example_02a_harder": { + "color": "dark-blue", + "index": 0, + "text": "Case 1" + }, + "example_02a_harder_LF": { + "color": "semi-dark-blue", + "index": 1, + "text": "Case 1 (LF)" + }, + "example_02a_harder_lstm": { + "color": "blue", + "index": 2, + "text": "Case 1 with LSTM" + }, + "example_02a_harder_lstm_LF": { + "color": "light-blue", + "index": 3, + "text": "Case 1 with LSTM (LF)" + }, + "example_02b_harder": { + "color": "semi-dark-purple", + "index": 4, + "text": "Case 2" + }, + "example_02b_harder_lstm": { + "color": "light-purple", + 
"index": 5, + "text": "Case 2 with LSTM" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Case 1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Case 1 (LF)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Case 1 with LSTM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Case 1 with LSTM (LF)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Case 2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Case 2 with LSTM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 15, + "w": 9, + "x": 11, + "y": 25 + }, + "id": 20, + "options": { + "bucketCount": 200, + "combine": false, + "legend": { + "calcs": [ + "mean", + "stdDev", + "count" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 1\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02a_harder' AND \r\n (CASE \r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case1", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 1 (LF)\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02a_harder_LF' AND \r\n (CASE \r\n WHEN 'exploration' in 
($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case1_LF", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 1 with LSTM\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND\r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02a_harder_lstm' AND \r\n (CASE \r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case1_LSTM", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 1 with LSTM (LF)\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND\r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02a_harder_lstm_LF' AND \r\n (CASE \r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 
'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case 1_LSTM_LF", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 2\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02b_harder' AND \r\n (CASE \r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case2", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "SELECT\r\n variable, \r\n ident,\r\n value as \"Case 2 with LSTM\"\r\nFROM \r\n kpis\r\nWHERE \r\n variable = 'run_time' AND \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) = 'example_02b_harder_lstm' AND \r\n (CASE \r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN 1=1\r\n WHEN 'exploration' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ 'eval'\r\n WHEN 'exploration' in ($sim_type_filter) AND 'evaluation' in ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'evaluation' in ($sim_type_filter) AND 'learning' in ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' OR ident ~ 'eval'\r\n WHEN 'exploration' IN ($sim_type_filter) THEN ident ~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n WHEN 'evaluation' IN ($sim_type_filter) THEN ident ~ 'eval'\r\n WHEN 'learning' IN ($sim_type_filter) THEN ident !~ '[0-9]{1,2}[_]{1}[1-5]{1}$' AND ident !~ 'eval'\r\n END)", + "refId": "Case2_LSTM", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Run Time per Simulation Episode", + "type": "histogram" + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "How many episodes were needed for training, until early stopping was triggered?", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "fieldMinMax": false, + "mappings": [ + { + "options": { + "1": { + "color": "dark-blue", + "index": 0, + "text": "Case 1" + }, + "2": { + "color": "semi-dark-blue", + "index": 1, + "text": "Case 1 (LF)" + }, + "3": { + "color": "blue", + "index": 2, + "text": "Case 1 with LSTM" + }, + "4": { + "color": "light-blue", + "index": 3, + "text": "Case 1 with LSTM (LF)" + }, + "5": { + "color": "semi-dark-purple", + "index": 4, + "text": "Case 2" + }, + "6": { + "color": "light-purple", + "index": 5, + "text": "Case 2 with LSTM" + } + }, + "type": "value" + } + ], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "min. episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-green", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "min. eval episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-green", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "avg. episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-yellow", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "avg. eval episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-yellow", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "max. episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "min. episodes" + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "max. eval episodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "min. 
eval episodes" + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "index" + }, + "properties": [ + { + "id": "unit", + "value": "time: " + }, + { + "id": "custom.axisGridShow", + "value": false + }, + { + "id": "max", + "value": 6.5 + }, + { + "id": "min", + "value": 0.5 + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 11, + "x": 0, + "y": 36 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xField": "index" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": false, + "rawQuery": true, + "rawSql": "CREATE EXTENSION IF NOT EXISTS tablefunc; \r\n\r\nSELECT \r\n scenario,\r\n ROW_NUMBER() OVER (ORDER BY scenario) as index,\r\n min(episodes_done) as \"min. episodes\", \r\n avg(episodes_done) as \"avg. episodes\", \r\n max(episodes_done) as \"max. episodes\"\r\nFROM (\r\n SELECT \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) as scenario, \r\n SUBSTRING(ident, regexp_instr(ident, 'run')) as run, \r\n episodes_done::NUMERIC\r\n FROM crosstab( \r\n 'SELECT ident, variable, value \r\n FROM kpis \r\n WHERE variable = ''episodes_done'' OR variable = ''eval_episodes_done'' \r\n ORDER BY ident, LENGTH(ident), variable',\r\n 'VALUES(''episodes_done''), (''eval_episodes_done'')' \r\n ) as ct (ident text, episodes_done text, eval_episodes_done text)\r\n ORDER BY scenario, LENGTH(ident), run) as subselect\r\nGROUP BY scenario", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + }, + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "hide": true, + "rawQuery": true, + "rawSql": "CREATE EXTENSION IF NOT EXISTS tablefunc; \r\n\r\nSELECT\r\n scenario\r\nFROM (\r\n SELECT \r\n SUBSTRING(ident, 1, regexp_instr(ident, 'run')-2) as scenario, \r\n SUBSTRING(ident, regexp_instr(ident, 'run')) as run, \r\n episodes_done, \r\n eval_episodes_done \r\n FROM crosstab( \r\n 'SELECT ident, variable, value \r\n FROM kpis \r\n WHERE variable = ''episodes_done'' OR variable = ''eval_episodes_done'' \r\n ORDER BY ident, LENGTH(ident), variable', \r\n 'VALUES(''episodes_done''), (''eval_episodes_done'')' \r\n ) as ct (ident text, episodes_done text, eval_episodes_done text)\r\n ORDER BY scenario, LENGTH(ident), run) as subselect\r\nGROUP BY \r\n scenario ", + "refId": "B", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Number of Episodes", + "type": "trend" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 27, + "panels": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "description": "How much profit make all learning units combined per evaluation episode? 
(Trained models in eval_episode 1 are mostly based on exploration runs and performing badly.)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": -3000000, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "text" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 50 + }, + "id": 22, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "vertical", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xField": "episode", + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "P7B13B9DF907EC40C" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\r\n SUBSTRING(simulation, 1, regexp_instr(simulation, 'eval')-2) as \"simulation run\",\r\n episode, \r\n sum(profit) as \"Sum (Learning Agents Profit)\"\r\nFROM rl_params\r\nWHERE simulation ~ '${simulation}_' AND perform_evaluation is true \r\nGROUP BY simulation, episode\r\nORDER BY 1, episode", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Total Evaluation Profit", + "type": "barchart" + } + ], + "title": "RIP - Plot Friedhof", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "example_02b_harder_run_4", + "value": "example_02b_harder_run_4" + }, + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "definition": "SELECT DISTINCT\nSUBSTRING(m.simulation, 0, LENGTH(m.simulation) +1 - strpos(REVERSE(m.simulation),'_')) AS market_simulation\nFROM rl_params m\nwhere learning_mode is True and perform_evaluation is False", + "description": "Can choose which simulation we want to show ", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "simulation", + "options": [], + "query": "SELECT DISTINCT\nSUBSTRING(m.simulation, 0, LENGTH(m.simulation) +1 - strpos(REVERSE(m.simulation),'_')) AS market_simulation\nFROM rl_params m\nwhere learning_mode is True and perform_evaluation is False", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "pp_6", + "value": "pp_6" + }, + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "definition": "SELECT DISTINCT unit\nFROM rl_params\nwhere simulation ~ '^${simulation}_[1-9]+'", + "description": "All units that have an reinforcment learning strategy and hence have the Rl specific parameteres logged", + "hide": 0, + "includeAll": false, + "label": "rl_unit", + "multi": false, + "name": "rl_unit", + "options": [], + "query": "SELECT DISTINCT unit\nFROM 
rl_params\nwhere simulation ~ '^${simulation}_[1-9]+'", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1551402000000", + "value": "1551402000000" + }, + "datasource": { + "type": "postgres", + "uid": "P7B13B9DF907EC40C" + }, + "definition": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "timeRange", + "options": [], + "query": "SELECT MIN(datetime), MAX(datetime) FROM rl_params", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "example_02b_harder" + ], + "value": [ + "example_02b_harder" + ] + }, + "description": "The four different scenario and actor combinations.", + "hide": 0, + "includeAll": false, + "label": "Case Study", + "multi": true, + "name": "case_study", + "options": [ + { + "selected": false, + "text": "example_02a_harder", + "value": "example_02a_harder" + }, + { + "selected": false, + "text": "example_02a_harder_LF", + "value": "example_02a_harder_LF" + }, + { + "selected": false, + "text": "example_02a_harder_lstm", + "value": "example_02a_harder_lstm" + }, + { + "selected": false, + "text": "example_02a_harder_lstm_LF", + "value": "example_02a_harder_lstm_LF" + }, + { + "selected": true, + "text": "example_02b_harder", + "value": "example_02b_harder" + }, + { + "selected": false, + "text": "example_02b_harder_lstm", + "value": "example_02b_harder_lstm" + } + ], + "query": "example_02a_harder, example_02a_harder_LF, example_02a_harder_lstm, example_02a_harder_lstm_LF, example_02b_harder, example_02b_harder_lstm", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": true, + "text": [ + "exploration" + ], + "value": [ + "exploration" + ] + }, + "description": "Charts can be filtered by type of simulation (exploration, learning or evaluation episodes).", + "hide": 0, + "includeAll": false, + "label": "Simulation Type", + "multi": true, + "name": "sim_type_filter", + "options": [ + { + "selected": true, + "text": "exploration", + "value": "exploration" + }, + { + "selected": false, + "text": "learning", + "value": "learning" + }, + { + "selected": false, + "text": "evaluation", + "value": "evaluation" + } + ], + "query": "exploration, learning, evaluation", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "2019-03-03T23:00:00.000Z", + "to": "2019-03-09T03:00:00.000Z" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h" + ] + }, + "timezone": "", + "title": "ASSUME: Actor Comparison", + "uid": "bdnvfato1dlogd", + "version": 16, + "weekStart": "" +} \ No newline at end of file From 9047578d0137a2d300ba6d7731e5c6cb44ecfab8 Mon Sep 17 00:00:00 2001 From: ufqjh Date: Fri, 25 Oct 2024 15:14:55 +0200 Subject: [PATCH 07/23] Fixed comments regarding centralized critic --- assume/reinforcement_learning/algorithms/ppo.py | 3 +-- assume/scenario/loader_csv.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index a46b7dfab..b385ccf54 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -518,13 +518,12 @@ def update_policy(self): self.n_updates += 1 # Iterate through over each agent's strategy - # Each agent has its own actor and critic. 
Critic (value network) is in comparison to MATD3 decentralized, meaning each agent learns its own value function. + # Each agent has its own actor. Critic (value network) is centralized. for u_id in self.learning_role.rl_strats.keys(): # Centralized critic = self.learning_role.critics[u_id] # Decentralized - #critic = self.learning_role.rl_strats[u_id].critic actor = self.learning_role.rl_strats[u_id].actor # Retrieve experiences from the buffer diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 67323582a..996daa567 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -979,7 +979,7 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) - world.run() # triggers calculate_bids() which equals to step + world.run() # triggers calculate_bids() inter_episodic_data = world.learning_role.get_inter_episodic_data() inter_episodic_data["episodes_done"] = episode From ca1f9fc66ecf9dc64cfbf190fe4286b9ad87d397 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Mon, 28 Oct 2024 16:56:34 +0100 Subject: [PATCH 08/23] - implemnted perform eval differentiation that gets rid of stochasticity - added mc as mean of distribution from which action_logits (softmax [-1,1] are substracted --- .../reinforcement_learning/algorithms/ppo.py | 24 ++++++++++++++----- docs/source/learning.rst | 2 +- docs/source/release_notes.rst | 6 +++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index b385ccf54..aca7fa963 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -659,23 +659,33 @@ def get_actions(rl_strategy, next_observation): actor = rl_strategy.actor device = rl_strategy.device + learning_mode = rl_strategy.learning_mode + perform_evaluation = rl_strategy.perform_evaluation # Pass observation through the actor network to get action logits (mean of action distribution) action_logits = actor(next_observation.to(device)) logger.debug(f"Action logits: {action_logits}") - # Create a normal distribution for continuous actions (with assumed standard deviation of 1.0) - action_distribution = th.distributions.Normal(action_logits, 1.0) + if learning_mode and not perform_evaluation: - logger.debug(f"Action distribution: {action_distribution}") + # Create a normal distribution for continuous actions (with assumed standard deviation of 1.0) + action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 1.0) - # Sample an action from the distribution - sampled_action = action_distribution.sample() + logger.debug(f"Action distribution: {action_distribution}") + + # Sample an action from the distribution + sampled_action = action_distribution.sample() + + + else: + # If not in learning mode or during evaluation, use the mean of the action distribution + sampled_action = action_logits logger.debug(f"Sampled action: {sampled_action}") - # Get the log probability of the sampled action (for later PPO loss calculation) + # Get the log probability of the sampled actions (for later PPO loss calculation) + # Sum the log probabilities across all action dimensions TODO: Why sum? 
log_prob_action = action_distribution.log_prob(sampled_action).sum(dim=-1) # Detach the log probability tensor to stop gradient tracking (since we only need the value for later) @@ -685,8 +695,10 @@ def get_actions(rl_strategy, next_observation): # PREVIOUSLY SET TO (-1, 1) # Bound actions to [0, 1] range + # TODO: Does it make more sense o to log probaility of the action before or after clamping? sampled_action = sampled_action.clamp(0, 1) logger.debug(f"Clamped sampled action: {sampled_action}") + return sampled_action, log_prob_action diff --git a/docs/source/learning.rst b/docs/source/learning.rst index 6e2d6c083..08cafe341 100644 --- a/docs/source/learning.rst +++ b/docs/source/learning.rst @@ -31,7 +31,7 @@ interacting in the same environment. The Markov game for :math:`N` agents consis a set of observations :math:`O_1, \ldots, O_N`, and a state transition function :math:`P: S \times A_1 \times \ldots \times A_N \rightarrow \mathcal{P}(S)` dependent on the state and actions of all agents. After taking action :math:`a_i \in A_i` in state :math:`s_i \in S` according to a policy :math:`\pi_i: O_i \rightarrow A_i`, every agent :math:`i` is transitioned into the new state :math:`s'_i \in S`. Each agent receives a reward :math:`r_i` according to the individual reward function :math:`R_i` and a private observation correlated with the state :math:`o_i: S \rightarrow O_i`. -Like MDP, each agent :math:`i` learns an optimal policy :math:`\pi_i^*(s)` that maximizes its expected reward. +Like Markov Decision Process (MDP), each agent :math:`i` learns an optimal policy :math:`\pi_i^*(s)` that maximizes its expected reward. To enable multi-agent learning some adjustments are needed within the learning algorithm to get from the TD3 to an MATD3 algorithm. Other authors used similar tweaks to improve the MADDPG algorithm and derive the MA-TD3 algorithm. diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d396da252..c6c552b1c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -12,6 +12,12 @@ Upcoming Release The features in this section are not released yet, but will be part of the next release! To use the features already you have to install the main branch, e.g. pip install git+https://github.com/assume-framework/assume +**New Features:** + +- **PPO Integration:** The Proximal Policy Optimization (PPO) algorithm has been integrated into the framework, +providing users with an additional reinforcement learning algorithm option for training agents. +PPO is a popular policy gradient method that has been shown to be effective in a wide range of applications, +making it a valuable addition to the framework's learning capabilities. A tutorial on how to use this feature is coming soon. 
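+The algorithm is selected per scenario via the ``algorithm`` key of the ``learning_config`` in the
+scenario's ``config.yaml``; PPO-specific settings (e.g. ``clip_ratio``, ``vf_coef``, ``entropy_coef``,
+``gae_lambda``) are nested under a ``ppo`` block, analogous to the existing ``matd3`` block.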
v0.4.0 - latest release (8th August 2024) ========================================= From e011baaefbc49ec4c1bac538c3a0d068bad7dd48 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 29 Oct 2024 10:25:26 +0100 Subject: [PATCH 09/23] - tensor handling in get_actions function - reset buffer for proper on-policy learning --- .../reinforcement_learning/algorithms/ppo.py | 15 +-- assume/reinforcement_learning/buffer.py | 4 +- .../reinforcement_learning/learning_role.py | 10 +- examples/inputs/example_02a/config.yaml | 105 ++++++++++++++++-- 4 files changed, 114 insertions(+), 20 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index aca7fa963..5493f9ae4 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -663,24 +663,25 @@ def get_actions(rl_strategy, next_observation): perform_evaluation = rl_strategy.perform_evaluation # Pass observation through the actor network to get action logits (mean of action distribution) - action_logits = actor(next_observation.to(device)) + action_logits = actor(next_observation).detach() logger.debug(f"Action logits: {action_logits}") - if learning_mode and not perform_evaluation: + # Create a normal distribution for continuous actions (with assumed standard deviation of + # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) + action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 0.01) - # Create a normal distribution for continuous actions (with assumed standard deviation of 1.0) - action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 1.0) + logger.debug(f"Action distribution: {action_distribution}") - logger.debug(f"Action distribution: {action_distribution}") + if learning_mode and not perform_evaluation: # Sample an action from the distribution - sampled_action = action_distribution.sample() + sampled_action = action_distribution.sample().to(device) else: # If not in learning mode or during evaluation, use the mean of the action distribution - sampled_action = action_logits + sampled_action = action_logits.detach() logger.debug(f"Sampled action: {sampled_action}") diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index 0566c28a5..d7b686621 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -411,8 +411,8 @@ def reset(self): # self.values = None # self.advantages = None # self.returns = None - # self.pos = 0 - # self.full = False + self.pos = 0 + self.full = False # def compute_returns_and_advantages(self, last_values, dones): # """ diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 48e9d8210..ab625fa79 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -65,10 +65,12 @@ def __init__( self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get( "actor_architecture", "mlp" ) - self.training_episodes = learning_config[self.rl_algorithm_name][ + self.training_episodes = learning_config[ "training_episodes" ] - self.train_freq = learning_config[self.rl_algorithm_name]["train_freq"] + self.train_freq = learning_config.get(self.rl_algorithm_name, {}).get( + "train_freq" + ) self.batch_size = learning_config.get(self.rl_algorithm_name, {}).get( "batch_size", 128 ) @@ -222,6 +224,10 @@ def save_buffer_and_update(self, content: 
dict, meta: dict) -> None: self.update_policy() + # since the PPO is an on-policy algorithm it onyl uses the expercience collected with the current policy + # after the policy-update which ultimately changes the policy, theb buffer needs to be cleared + self.buffer.reset() + # TD3 def turn_off_initial_exploration(self) -> None: """ diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 7779be514..0c695eeb5 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -1,8 +1,97 @@ # SPDX-FileCopyrightText: ASSUME Developers # # SPDX-License-Identifier: AGPL-3.0-or-later - tiny: + start_date: 2019-01-01 00:00 + end_date: 2019-01-05 00:00 + time_step: 1h + save_frequency_hours: null + learning_mode: True + + learning_config: + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 + algorithm: matd3 + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 10 + episodes_collecting_initial_experience: 3 + train_freq: 24h + gradient_steps: -1 + batch_size: 64 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + + markets_config: + EOM: + operator: EOM_operator + product_type: energy + products: + - duration: 1h + count: 1 + first_delivery: 1h + opening_frequency: 1h + opening_duration: 1h + volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear + + +base: + start_date: 2019-03-01 00:00 + end_date: 2019-03-31 00:00 + time_step: 1h + save_frequency_hours: null + learning_mode: True + + learning_config: + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 + algorithm: matd3 + actor_architecture: mlp + learning_rate: 0.001 + training_episodes: 50 + episodes_collecting_initial_experience: 5 + train_freq: 24h + gradient_steps: -1 + batch_size: 256 + gamma: 0.99 + device: cpu + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + validation_episodes_interval: 5 + early_stopping_steps: 10 + early_stopping_threshold: 0.05 + + markets_config: + EOM: + operator: EOM_operator + product_type: energy + products: + - duration: 1h + count: 1 + first_delivery: 1h + opening_frequency: 1h + opening_duration: 1h + volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear + + +tiny_ppo: start_date: 2019-01-01 00:00 end_date: 2019-01-05 00:00 time_step: 1h @@ -33,7 +122,7 @@ tiny: actor_architecture: mlp training_episodes: 10 validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated - train_freq: 48h # how often write_to_learning_role gets called + train_freq: 64h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards epochs: 5 # #4 # Number of epochs for updating the policy clip_ratio: 0.2 # Clipping parameter for policy updates @@ -61,7 +150,7 @@ tiny: market_mechanism: pay_as_clear -base: +base_ppo: start_date: 2019-03-01 00:00 end_date: 2019-03-31 00:00 time_step: 1h @@ -75,23 +164,21 @@ base: algorithm: ppo device: cpu learning_rate: 0.001 + validation_episodes_interval: 2 # after how many episodes the validation starts and the policy is updated + training_episodes: 10 matd3: actor_architecture: mlp - training_episodes: 10 + train_freq: 24h # how often write_to_learning_role gets called episodes_collecting_initial_experience: 
3 - train_freq: 48h gradient_steps: -1 batch_size: 64 gamma: 0.99 noise_sigma: 0.1 noise_scale: 1 noise_dt: 1 - validation_episodes_interval: 5 ppo: actor_architecture: mlp - training_episodes: 10 - validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated - train_freq: 24h # how often write_to_learning_role gets called + train_freq: 64h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards epochs: 5 # #4 # Number of epochs for updating the policy clip_ratio: 0.2 # Clipping parameter for policy updates From 5d2dd2d339bd0194b57d908e771186b7d0346b18 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 29 Oct 2024 11:49:31 +0100 Subject: [PATCH 10/23] =?UTF-8?q?convergence=20of=20PPO=3F=20zur=C3=BCckha?= =?UTF-8?q?ltendes=20wuhu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- assume/reinforcement_learning/algorithms/ppo.py | 2 +- examples/inputs/example_02a/config.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 5493f9ae4..524f43c14 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -669,7 +669,7 @@ def get_actions(rl_strategy, next_observation): # Create a normal distribution for continuous actions (with assumed standard deviation of # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) - action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 0.01) + action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 0.2) logger.debug(f"Action distribution: {action_distribution}") diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 0c695eeb5..fa84426ab 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -164,8 +164,8 @@ base_ppo: algorithm: ppo device: cpu learning_rate: 0.001 - validation_episodes_interval: 2 # after how many episodes the validation starts and the policy is updated - training_episodes: 10 + validation_episodes_interval: 100 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called From 79c287acae902bd672aa14ed5c075d7ee1b28538 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 29 Oct 2024 12:12:59 +0100 Subject: [PATCH 11/23] - pushed config setting for reproducability --- examples/inputs/example_02a/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index fa84426ab..d867216b1 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -164,7 +164,7 @@ base_ppo: algorithm: ppo device: cpu learning_rate: 0.001 - validation_episodes_interval: 100 # after how many episodes the validation starts and the policy is updated + validation_episodes_interval: 80 # after how many episodes the validation starts and the policy is updated training_episodes: 100 matd3: actor_architecture: mlp From 97c3bb64f579623e5c5f83a7199e0bcc14d9daf0 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 29 Oct 2024 17:00:52 +0100 Subject: [PATCH 12/23] - added further todos for prettier code and better handling of critic information --- 
.../reinforcement_learning/algorithms/ppo.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 524f43c14..8e842cba8 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -219,9 +219,6 @@ def load_critic_params(self, directory: str) -> None: self.learning_role.critics[u_id].load_state_dict( critic_params["critic"] ) - # self.learning_role.target_critics[u_id].load_state_dict( - # critic_params["critic_target"] - # ) self.learning_role.critics[u_id].optimizer.load_state_dict( critic_params["critic_optimizer"] ) @@ -514,6 +511,26 @@ def update_policy(self): # We will iterate for multiple epochs to update both the policy (actor) and value (critic) networks # The number of epochs controls how many times we update using the same collected data (from the buffer). + + # Retrieve experiences from the buffer + # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. + transitions = self.learning_role.buffer.get() + states = transitions.observations + actions = transitions.actions + rewards = transitions.rewards + log_probs = transitions.log_probs + + # STARTING FROM HERE, THE IMPLEMENTATION NEEDS TO BE FIXED + # Potentially, it could be useful to source some functionality out into methods stored in buffer.py + + # Pass the current states through the critic network to get value estimates. + #TODO: Handling von unique obs dim and make ciritc obsevrations better, + #TODO: critic should not get own action to estimate value, here confusion with Q(s,a) and V(s), need acitions of other agent though + #TODO: hier durch alle critics gehen und values generieren, weil sonst ciritcgeupdated zwischen durhc und neue values + # loop mit allen critics für value und advantage berehcnung als extra function hier in PPO + values = critic(states, actions).squeeze(dim=2) + + #TODO: epochen in gradient steps umbennen for _ in range(self.epochs): self.n_updates += 1 @@ -526,29 +543,13 @@ def update_policy(self): # Decentralized actor = self.learning_role.rl_strats[u_id].actor - # Retrieve experiences from the buffer - # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. - transitions = self.learning_role.buffer.get() - states = transitions.observations - actions = transitions.actions - rewards = transitions.rewards - log_probs = transitions.log_probs - - # STARTING FROM HERE, THE IMPLEMENTATION NEEDS TO BE FIXED - # Potentially, it could be useful to source some functionality out into methods stored in buffer.py - # Pass the current states through the critic network to get value estimates. - values = critic(states, actions).squeeze(dim=2) logger.debug(f"Values: {values}") - # Store the calculated values in the rollout buffer - # These values are used later to calculate the advantage estimates (for policy updates). 
- self.learning_role.buffer.values = values.detach().cpu().numpy() - # Compute advantages using Generalized Advantage Estimation (GAE) advantages = [] - last_advantage = 0 + advantage = 0 returns = [] # Iterate through the collected experiences in reverse order to calculate advantages and returns @@ -561,24 +562,30 @@ def update_policy(self): else: next_value = values[t + 1] - # Temporal difference delta + # Temporal difference delta Equation 12 from PPO paper delta = ( - rewards[t] + self.gamma * next_value - values[t] + - values[t] + rewards[t] + self.gamma * next_value ) # Use self.gamma for discount factor logger.debug(f"Delta: {delta}") - # GAE advantage - last_advantage = ( - delta + self.gamma * self.gae_lambda * last_advantage + # GAE advantage Equation 11 from PPO paper + advantage = ( + delta + self.gamma * self.gae_lambda * advantage ) # Use self.gae_lambda for advantage estimation - logger.debug(f"Last_advantage: {last_advantage}") + logger.debug(f"Last_advantage: {advantage}") - advantages.insert(0, last_advantage) - returns.insert(0, last_advantage + values[t]) + advantages.insert(0, advantage) + returns.insert(0, advantage + values[t]) # Convert advantages and returns to tensors + #TODO: normalisieren von advantages wie in spinning up von + #also done in mappo + #advantages_copy[buffer.active_masks[:-1] == 0.0] = np.nan + #mean_advantages = np.nanmean(advantages_copy) + #std_advantages = np.nanstd(advantages_copy) + #advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) advantages = th.tensor(advantages, dtype=th.float32, device=self.device) returns = th.tensor(returns, dtype=th.float32, device=self.device) @@ -587,8 +594,12 @@ def update_policy(self): action_stddev = th.ones_like( action_means ) # Assuming fixed standard deviation for simplicity + # TODO: rename to actions function and use same fix std + # TODO: move mean and std in extra actor that outputs distributin immediately dist = th.distributions.Normal(action_means, action_stddev) new_log_probs = dist.log_prob(actions).sum(-1) + + entropy = dist.entropy().sum(-1) # Compute the ratio of new policy to old policy @@ -606,8 +617,8 @@ def update_policy(self): logger.debug(f"surrogate1: {surrogate1}") logger.debug(f"surrogate2: {surrogate2}") - # Final policy loss (clipped surrogate loss) - policy_loss = -th.min(surrogate1, surrogate2).mean() + # Final policy loss (clipped surrogate loss) equation 7 from PPO paper + policy_loss = th.min(surrogate1, surrogate2).mean() logger.debug(f"policy_loss: {policy_loss}") @@ -617,8 +628,9 @@ def update_policy(self): logger.debug(f"value loss: {value_loss}") # Total loss: policy loss + value loss - entropy bonus + # euqation 9 from PPO paper multiplied with -1 to enable minimizing total_loss = ( - policy_loss + - policy_loss + self.vf_coef * value_loss - self.entropy_coef * entropy.mean() ) # Use self.vf_coef and self.entropy_coef @@ -669,6 +681,7 @@ def get_actions(rl_strategy, next_observation): # Create a normal distribution for continuous actions (with assumed standard deviation of # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) + # TODO: differently fixed std for policy update and action sampling!? 
action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 0.2) logger.debug(f"Action distribution: {action_distribution}") From 77f28d5fd148e3d9cc0f6681159ae3b7b519f6cb Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Wed, 30 Oct 2024 13:29:16 +0100 Subject: [PATCH 13/23] - moved advantage and value calculation outside of the loop since it would always be calculated new with the gradient steps, which is not in accordance with the spinning up implementation - also added normalisation of advantages in accordance with Mappo - used obs collection for central critic from MATD3 to enable proper multi-agent learning - renamed epoch to gradient step --- .../algorithms/matd3.py | 45 +---- .../reinforcement_learning/algorithms/ppo.py | 164 +++++++++++------- .../reinforcement_learning/learning_role.py | 14 +- .../reinforcement_learning/learning_utils.py | 40 +++++ examples/inputs/example_02a/config.yaml | 2 +- 5 files changed, 158 insertions(+), 107 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index a5e9b9f27..807708393 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -11,7 +11,7 @@ from assume.common.base import LearningStrategy from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm -from assume.reinforcement_learning.learning_utils import polyak_update +from assume.reinforcement_learning.learning_utils import polyak_update, collect_obs_for_central_critic from assume.reinforcement_learning.neural_network_architecture import CriticTD3 logger = logging.getLogger(__name__) @@ -433,47 +433,14 @@ def update_policy(self): all_actions = actions.view(self.batch_size, -1) - # this takes the unique observations from all other agents assuming that - # the unique observations are at the end of the observation vector - temp = th.cat( - ( - states[:, :i, self.obs_dim - self.unique_obs_dim :].reshape( - self.batch_size, -1 - ), - states[ - :, i + 1 :, self.obs_dim - self.unique_obs_dim : - ].reshape(self.batch_size, -1), - ), - axis=1, + #collect observations for critic + all_states = collect_obs_for_central_critic( + states, i, self.obs_dim, self.unique_obs_dim, self.batch_size ) - - # the final all_states vector now contains the current agent's observation - # and the unique observations from all other agents - all_states = th.cat( - (states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 - ).view(self.batch_size, -1) - # all_states = states[:, i, :].reshape(self.batch_size, -1) - - # this is the same as above but for the next states - temp = th.cat( - ( - next_states[ - :, :i, self.obs_dim - self.unique_obs_dim : - ].reshape(self.batch_size, -1), - next_states[ - :, i + 1 :, self.obs_dim - self.unique_obs_dim : - ].reshape(self.batch_size, -1), - ), - axis=1, + all_next_states = collect_obs_for_central_critic( + next_states, i, self.obs_dim, self.unique_obs_dim, self.batch_size ) - # the final all_next_states vector now contains the current agent's observation - # and the unique observations from all other agents - all_next_states = th.cat( - (next_states[:, i, :].reshape(self.batch_size, -1), temp), axis=1 - ).view(self.batch_size, -1) - # all_next_states = next_states[:, i, :].reshape(self.batch_size, -1) - with th.no_grad(): # Compute the next Q-values: min over all critics targets next_q_values = th.cat( diff --git a/assume/reinforcement_learning/algorithms/ppo.py 
b/assume/reinforcement_learning/algorithms/ppo.py index 8e842cba8..8c259786e 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -12,6 +12,8 @@ from assume.common.base import LearningStrategy from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm from assume.reinforcement_learning.neural_network_architecture import CriticPPO +from assume.reinforcement_learning.learning_utils import collect_obs_for_central_critic + logger = logging.getLogger(__name__) @@ -35,7 +37,7 @@ def __init__( learning_role, learning_rate: float, gamma: float, # Discount factor for future rewards - epochs: int, # Number of epochs for updating the policy + gradient_steps: int, # Number of steps for updating the policy clip_ratio: float, # Clipping parameter for policy updates vf_coef: float, # Value function coefficient in the loss function entropy_coef: float, # Entropy coefficient for exploration @@ -49,7 +51,7 @@ def __init__( gamma=gamma, actor_architecture=actor_architecture, ) - self.epochs = epochs + self.gradient_steps = gradient_steps self.clip_ratio = clip_ratio self.vf_coef = vf_coef self.entropy_coef = entropy_coef @@ -501,6 +503,93 @@ def extract_policy(self) -> dict: } return actors_and_critics + + def get_values(self, states, actions): + """ + Gets values for a unit based on the observation using PPO. + + Args: + rl_strategy (RLStrategy): The strategy containing relevant information. + next_observation (torch.Tensor): The observation. + + Returns: + torch.Tensor: The value of the observation. + """ + #counter iterating over all agents for dynamic buffer slice + i=0 + + #get length of all states to pass it on as batch size, since the entire buffer is used for the PPO + buffer_length = len(states) + all_actions = actions.view(buffer_length, -1) + # Initialize an empty tensor to store all values + all_values = th.empty(0, buffer_length, 1) + + for u_id in self.learning_role.rl_strats.keys(): + + all_states = collect_obs_for_central_critic(states, i, self.obs_dim, self.unique_obs_dim, buffer_length) + + critic = self.learning_role.critics[u_id] + + # Pass the current states through the critic network to get value estimates. 
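+            # Note: because the centralized critic also receives the joint actions, this estimate is
+            # closer to Q(s, a) than to a pure state value V(s); PPO's advantage calculation formally
+            # expects V(s), so treating this output as a value estimate is an approximation.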
+ values = critic(all_states, all_actions) + + if all_values.numel() == 0: + all_values = values + else: + all_values = th.cat((all_values, values), dim=1) + + i=i+1 + + return all_values + + def get_advantages(self, rewards, values): + + # Compute advantages using Generalized Advantage Estimation (GAE) + advantages = [] + advantage = 0 + returns = [] + + # Iterate through the collected experiences in reverse order to calculate advantages and returns + for t in reversed(range(len(rewards))): + + logger.debug(f"Reward: {t}") + + if t == len(rewards) - 1: + next_value = 0 + else: + next_value = values[t + 1] + + # Temporal difference delta Equation 12 from PPO paper + delta = ( + - values[t] + rewards[t] + self.gamma * next_value + ) # Use self.gamma for discount factor + + logger.debug(f"Delta: {delta}") + + # GAE advantage Equation 11 from PPO paper + advantage = ( + delta + self.gamma * self.gae_lambda * advantage + ) # Use self.gae_lambda for advantage estimation + + logger.debug(f"Last_advantage: {advantage}") + + advantages.insert(0, advantage) + returns.insert(0, advantage + values[t]) + + # Convert advantages and returns to tensors + advantages = th.tensor(advantages, dtype=th.float32, device=self.device) + returns = th.tensor(returns, dtype=th.float32, device=self.device) + + #Normalize advantages + #in accordance with spinning up and mappo version of PPO + mean_advantages = th.nanmean(advantages) + std_advantages = th.std(advantages) + advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) + + #TODO: Should we detach here? I though becaus of normalisation not being included in backward + # but unsure if this is correct + return advantages, returns + def update_policy(self): """ @@ -512,26 +601,21 @@ def update_policy(self): # The number of epochs controls how many times we update using the same collected data (from the buffer). - # Retrieve experiences from the buffer + # Retrieve experiences from the buffer # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. transitions = self.learning_role.buffer.get() states = transitions.observations actions = transitions.actions rewards = transitions.rewards log_probs = transitions.log_probs + + # Pass the current states through the critic network to get value estimates. + values = self.get_values(states, actions) - # STARTING FROM HERE, THE IMPLEMENTATION NEEDS TO BE FIXED - # Potentially, it could be useful to source some functionality out into methods stored in buffer.py + # Compute advantages using Generalized Advantage Estimation (GAE) + advantages, returns = self.get_advantages(rewards, values) - # Pass the current states through the critic network to get value estimates. 
- #TODO: Handling von unique obs dim and make ciritc obsevrations better, - #TODO: critic should not get own action to estimate value, here confusion with Q(s,a) and V(s), need acitions of other agent though - #TODO: hier durch alle critics gehen und values generieren, weil sonst ciritcgeupdated zwischen durhc und neue values - # loop mit allen critics für value und advantage berehcnung als extra function hier in PPO - values = critic(states, actions).squeeze(dim=2) - - #TODO: epochen in gradient steps umbennen - for _ in range(self.epochs): + for _ in range(self.gradient_steps): self.n_updates += 1 # Iterate through over each agent's strategy @@ -544,51 +628,6 @@ def update_policy(self): actor = self.learning_role.rl_strats[u_id].actor - - logger.debug(f"Values: {values}") - - # Compute advantages using Generalized Advantage Estimation (GAE) - advantages = [] - advantage = 0 - returns = [] - - # Iterate through the collected experiences in reverse order to calculate advantages and returns - for t in reversed(range(len(rewards))): - - logger.debug(f"Reward: {t}") - - if t == len(rewards) - 1: - next_value = 0 - else: - next_value = values[t + 1] - - # Temporal difference delta Equation 12 from PPO paper - delta = ( - - values[t] + rewards[t] + self.gamma * next_value - ) # Use self.gamma for discount factor - - logger.debug(f"Delta: {delta}") - - # GAE advantage Equation 11 from PPO paper - advantage = ( - delta + self.gamma * self.gae_lambda * advantage - ) # Use self.gae_lambda for advantage estimation - - logger.debug(f"Last_advantage: {advantage}") - - advantages.insert(0, advantage) - returns.insert(0, advantage + values[t]) - - # Convert advantages and returns to tensors - #TODO: normalisieren von advantages wie in spinning up von - #also done in mappo - #advantages_copy[buffer.active_masks[:-1] == 0.0] = np.nan - #mean_advantages = np.nanmean(advantages_copy) - #std_advantages = np.nanstd(advantages_copy) - #advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) - advantages = th.tensor(advantages, dtype=th.float32, device=self.device) - returns = th.tensor(returns, dtype=th.float32, device=self.device) - # Evaluate the new log-probabilities and entropy under the current policy action_means = actor(states) action_stddev = th.ones_like( @@ -640,7 +679,7 @@ def update_policy(self): # Zero the gradients and perform backpropagation for both actor and critic actor.optimizer.zero_grad() critic.optimizer.zero_grad() - total_loss.backward() + total_loss.backward(retain_graph=True) # Clip gradients to prevent gradient explosion th.nn.utils.clip_grad_norm_( @@ -654,6 +693,8 @@ def update_policy(self): actor.optimizer.step() critic.optimizer.step() + + def get_actions(rl_strategy, next_observation): """ @@ -716,3 +757,6 @@ def get_actions(rl_strategy, next_observation): return sampled_action, log_prob_action + + + diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index ab625fa79..440bbcd87 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -82,16 +82,16 @@ def __init__( 1, ) + self.gradient_steps = ( + int(self.train_freq[:-1]) + if learning_config.get("gradient_steps", -1) == -1 + else learning_config["gradient_steps"] + ) + # Algorithm-specific parameters if self.rl_algorithm_name == "matd3": self.buffer: ReplayBuffer = None self.target_critics = {} - - self.gradient_steps = ( - int(self.train_freq[:-1]) - if learning_config["matd3"].get("gradient_steps", 
-1) == -1 - else learning_config["matd3"]["gradient_steps"] - ) self.noise_sigma = learning_config["matd3"]["noise_sigma"] self.noise_scale = learning_config["matd3"]["noise_scale"] @@ -281,7 +281,7 @@ def create_learning_algorithm(self, algorithm: str): learning_role=self, learning_rate=self.learning_rate, gamma=self.gamma, # Discount factor - epochs=self.steps_per_epoch, # Number of epochs for policy updates + gradient_steps=self.gradient_steps, # Number of epochs for policy updates clip_ratio=self.clip_ratio, # PPO-specific clipping parameter vf_coef=self.value_coeff, # Coefficient for value function loss entropy_coef=self.entropy_coeff, # Coefficient for entropy to encourage exploration diff --git a/assume/reinforcement_learning/learning_utils.py b/assume/reinforcement_learning/learning_utils.py index 2189c37bb..7c624a83d 100644 --- a/assume/reinforcement_learning/learning_utils.py +++ b/assume/reinforcement_learning/learning_utils.py @@ -101,6 +101,46 @@ def polyak_update(params, target_params, tau: float): th.add(target_param.data, param.data, alpha=tau, out=target_param.data) +def collect_obs_for_central_critic( + states: th.Tensor, i: int, obs_dim: int, unique_obs_dim: int, batch_size: int +) -> th.Tensor: + """ + This function samels the observations from allagents for the central critic. + In detail it takes all actions and concates all unique_obs of the agents and one time the similar observations. + + Args: + actions (th.Tensor): The actions + n_agents (int): Number of agents + n_actions (int): Number of actions + + Returns: + th.Tensor: The sampled actions + """ + # Sample actions for the central critic + + # this takes the unique observations from all other agents assuming that + # the unique observations are at the end of the observation vector + temp = th.cat( + ( + states[:, :i, obs_dim - unique_obs_dim :].reshape( + batch_size, -1 + ), + states[ + :, i + 1 :, obs_dim - unique_obs_dim : + ].reshape(batch_size, -1), + ), + axis=1, + ) + + # the final all_states vector now contains the current agent's observation + # and the unique observations from all other agents + all_states = th.cat( + (states[:, i, :].reshape(batch_size, -1), temp), axis=1 + ).view(batch_size, -1) + + + return all_states + # # For non-dynamic PPO buffer size calculation (remove if buffer stays dynamic) # def convert_to_timedelta(time_str): # # Wenn bereits ein Timedelta-Objekt, direkt zurückgeben diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index d867216b1..0c8d3ba90 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -166,11 +166,11 @@ base_ppo: learning_rate: 0.001 validation_episodes_interval: 80 # after how many episodes the validation starts and the policy is updated training_episodes: 100 + gradient_steps: 1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called episodes_collecting_initial_experience: 3 - gradient_steps: -1 batch_size: 64 gamma: 0.99 noise_sigma: 0.1 From a0c2a883622e29093b8db7468f180ae6e7795ada Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Wed, 30 Oct 2024 14:10:42 +0100 Subject: [PATCH 14/23] - introduce new actor architecture with distributuon layer so that we do not define two distribtuions one in get_actions and one in policiy update, as this is prone to mistakes --- .../algorithms/__init__.py | 2 ++ .../reinforcement_learning/algorithms/ppo.py | 30 ++++++++----------- .../neural_network_architecture.py | 23 ++++++++++++++ 
examples/inputs/example_02a/config.yaml | 2 +- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/__init__.py b/assume/reinforcement_learning/algorithms/__init__.py index 645e5c991..cb23e79b6 100644 --- a/assume/reinforcement_learning/algorithms/__init__.py +++ b/assume/reinforcement_learning/algorithms/__init__.py @@ -7,9 +7,11 @@ from assume.reinforcement_learning.neural_network_architecture import ( MLPActor, LSTMActor, + DistActor, ) actor_architecture_aliases: dict[str, type[nn.Module]] = { "mlp": MLPActor, "lstm": LSTMActor, + "dist": DistActor, } diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 8c259786e..4fdf31f63 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -59,6 +59,12 @@ def __init__( self.gae_lambda = gae_lambda self.n_updates = 0 # Number of updates performed + # write error if different actor_architecture than dist is used + if actor_architecture != "dist": + raise ValueError( + "PPO only supports the 'dist' actor architecture. Please define 'dist' as actor architecture in config." + ) + # Unchanged method from MATD3 def save_params(self, directory): """ @@ -629,17 +635,11 @@ def update_policy(self): # Evaluate the new log-probabilities and entropy under the current policy - action_means = actor(states) - action_stddev = th.ones_like( - action_means - ) # Assuming fixed standard deviation for simplicity - # TODO: rename to actions function and use same fix std - # TODO: move mean and std in extra actor that outputs distributin immediately - dist = th.distributions.Normal(action_means, action_stddev) - new_log_probs = dist.log_prob(actions).sum(-1) + action_logits, action_distribution = actor(states) + new_log_probs = action_distribution.log_prob(actions).sum(-1) - entropy = dist.entropy().sum(-1) + entropy = action_distribution.entropy().sum(-1) # Compute the ratio of new policy to old policy ratio = (new_log_probs - log_probs).exp() @@ -716,15 +716,10 @@ def get_actions(rl_strategy, next_observation): perform_evaluation = rl_strategy.perform_evaluation # Pass observation through the actor network to get action logits (mean of action distribution) - action_logits = actor(next_observation).detach() - + action_logits, action_distribution = actor(next_observation) + action_logits = action_logits.detach() logger.debug(f"Action logits: {action_logits}") - # Create a normal distribution for continuous actions (with assumed standard deviation of - # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) - # TODO: differently fixed std for policy update and action sampling!? - action_distribution = th.distributions.Normal(next_observation[-1]-action_logits, 0.2) - logger.debug(f"Action distribution: {action_distribution}") if learning_mode and not perform_evaluation: @@ -732,7 +727,6 @@ def get_actions(rl_strategy, next_observation): # Sample an action from the distribution sampled_action = action_distribution.sample().to(device) - else: # If not in learning mode or during evaluation, use the mean of the action distribution sampled_action = action_logits.detach() @@ -750,7 +744,7 @@ def get_actions(rl_strategy, next_observation): # PREVIOUSLY SET TO (-1, 1) # Bound actions to [0, 1] range - # TODO: Does it make more sense o to log probaility of the action before or after clamping? + # TODO: Does it make more sense to log probaility of the action before or after clamping? 
sampled_action = sampled_action.clamp(0, 1) logger.debug(f"Clamped sampled action: {sampled_action}") diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index 8b1a1f0ad..a5cfa09bd 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -169,6 +169,29 @@ def forward(self, obs): #x = th.sigmoid(self.FC3(x)) return x + +class DistActor(MLPActor): + """ + The actor based on the neural network MLP actor that contrcuts a distribution for the action defintion. + """ + + + def forward(self, obs): + x = F.relu(self.FC1(obs)) + x = F.relu(self.FC2(x)) + # Works with MATD3, output of softsign: [-1, 1] + x = F.softsign(self.FC3(x)) + + # x = th.tanh(self.FC3(x)) + + # Tested for PPO, scales the output to [0, 1] range + #x = th.sigmoid(self.FC3(x)) + + # Create a normal distribution for continuous actions (with assumed standard deviation of + # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) + dist = th.distributions.Normal(x, 0.2) + + return x, dist class LSTMActor(Actor): diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 0c8d3ba90..c3f724af7 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -177,7 +177,7 @@ base_ppo: noise_scale: 1 noise_dt: 1 ppo: - actor_architecture: mlp + actor_architecture: dist train_freq: 64h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards epochs: 5 # #4 # Number of epochs for updating the policy From cd3091595c6bb01a90c80f681709e7d8d40a1414 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Wed, 30 Oct 2024 15:53:09 +0100 Subject: [PATCH 15/23] - delted epochs from config --- examples/inputs/example_02a/config.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index c3f724af7..14335d2ed 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -164,8 +164,8 @@ base_ppo: algorithm: ppo device: cpu learning_rate: 0.001 - validation_episodes_interval: 80 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 + validation_episodes_interval: 900 # after how many episodes the validation starts and the policy is updated + training_episodes: 1000 gradient_steps: 1 matd3: actor_architecture: mlp @@ -180,7 +180,6 @@ base_ppo: actor_architecture: dist train_freq: 64h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards - epochs: 5 # #4 # Number of epochs for updating the policy clip_ratio: 0.2 # Clipping parameter for policy updates vf_coef: 0.5 # Value function coefficient in the loss function entropy_coef: 0.01 # Entropy coefficient for exploration From 5ad6e15cb57b947497c2f77c067c0bf00b439098 Mon Sep 17 00:00:00 2001 From: adiwied <92265142+AdrianWiedemann@users.noreply.github.com> Date: Wed, 13 Nov 2024 20:13:45 -0600 Subject: [PATCH 16/23] add mini batch sampling to ppo --- .../reinforcement_learning/algorithms/ppo.py | 28 ++++++++++++----- assume/reinforcement_learning/buffer.py | 31 +++++++++++++++++++ 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 4fdf31f63..ce412969c 100644 --- 
a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -609,21 +609,33 @@ def update_policy(self): # Retrieve experiences from the buffer # The collected experiences (observations, actions, rewards, log_probs) are stored in the buffer. - transitions = self.learning_role.buffer.get() - states = transitions.observations - actions = transitions.actions - rewards = transitions.rewards - log_probs = transitions.log_probs + full_transitions = self.learning_role.buffer.get() + full_values = self.get_values(full_transitions.observations, full_transitions.actions) + full_advantages, full_returns = self.get_advantages(full_transitions.rewards, full_values) + + #states = transitions.observations + #actions = transitions.actions + #rewards = transitions.rewards + #log_probs = transitions.log_probs # Pass the current states through the critic network to get value estimates. - values = self.get_values(states, actions) + #values = self.get_values(states, actions) # Compute advantages using Generalized Advantage Estimation (GAE) - advantages, returns = self.get_advantages(rewards, values) + #advantages, returns = self.get_advantages(rewards, values) for _ in range(self.gradient_steps): self.n_updates += 1 + batch_size = 32 #todo: get batch_size directly from config + transitions, batch_inds = self.learning_role.buffer.sample(batch_size) + states = transitions.observations + actions = transitions.actions + log_probs = transitions.log_probs + advantages = full_advantages[batch_inds] + returns = full_returns[batch_inds] + values = self.get_values(states, actions) # always use updated values --> check later if best + # Iterate through over each agent's strategy # Each agent has its own actor. Critic (value network) is centralized. for u_id in self.learning_role.rl_strats.keys(): @@ -635,7 +647,7 @@ def update_policy(self): # Evaluate the new log-probabilities and entropy under the current policy - action_logits, action_distribution = actor(states) + action_distribution = actor(states)[1] new_log_probs = action_distribution.log_prob(actions).sum(-1) diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index d7b686621..a3d5ccb0b 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -492,3 +492,34 @@ def get(self) -> RolloutBufferTransitions: ) return RolloutBufferTransitions(*tuple(map(self.to_torch, data))) + + + def sample(self, batch_size: int) -> RolloutBufferTransitions: + """ + Samples a random batch of experiences from the rollout buffer. + Unlike the replay buffer, this samples only from the current rollout data (up to self.pos) + and includes log probabilities needed for PPO updates. + + Args: + batch_size (int): The number of experiences to sample. + + Returns: + RolloutBufferTransitions: A named tuple containing the sampled observations, actions, rewards, + and log probabilities. + + Raises: + Exception: If there are less than batch_size entries in the buffer. 
+ """ + if self.pos < batch_size: + raise Exception(f"Not enough entries in buffer (need {batch_size}, have {self.pos})") + + batch_inds = np.random.randint(0, self.pos, size=batch_size) + + data = ( + self.observations[batch_inds, :, :], + self.actions[batch_inds, :, :], + self.rewards[batch_inds], + self.log_probs[batch_inds], + ) + + return RolloutBufferTransitions(*tuple(map(self.to_torch, data))), batch_inds # also return the indices of the sampled minibatch episodes \ No newline at end of file From 8928bf7a49b62a3073ebda032af50d1aa93d065f Mon Sep 17 00:00:00 2001 From: adiwied <92265142+AdrianWiedemann@users.noreply.github.com> Date: Fri, 15 Nov 2024 12:16:08 -0600 Subject: [PATCH 17/23] fix clamping of action distribution --- .../reinforcement_learning/algorithms/ppo.py | 2 +- .../neural_network_architecture.py | 2 +- examples/inputs/example_02a/config.yaml | 24 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index ce412969c..8537fa328 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -757,7 +757,7 @@ def get_actions(rl_strategy, next_observation): # PREVIOUSLY SET TO (-1, 1) # Bound actions to [0, 1] range # TODO: Does it make more sense to log probaility of the action before or after clamping? - sampled_action = sampled_action.clamp(0, 1) + sampled_action = sampled_action.clamp(-1, 1) #--> why clamped and why often zero in action dim 1 logger.debug(f"Clamped sampled action: {sampled_action}") diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index a5cfa09bd..8946cf6cd 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -189,7 +189,7 @@ def forward(self, obs): # Create a normal distribution for continuous actions (with assumed standard deviation of # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) 
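         # A possible refinement (sketch, not part of this patch): make the spread a learnable,
         # state-independent parameter instead of the fixed value, e.g. register
         # self.log_std = nn.Parameter(th.zeros(act_dim, dtype=float_type)) in __init__ and build
         # dist = th.distributions.Normal(x, th.exp(self.log_std)) here, so the exploration width
         # is trained jointly with the policy mean, as is common in PPO implementations.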
- dist = th.distributions.Normal(x, 0.2) + dist = th.distributions.Normal(x, 0.2) # --> eventuell als hyperparameter und eventuell sigmoid (0,1) return x, dist diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 14335d2ed..964264a9d 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -163,10 +163,10 @@ base_ppo: max_bid_price: 100 algorithm: ppo device: cpu - learning_rate: 0.001 - validation_episodes_interval: 900 # after how many episodes the validation starts and the policy is updated - training_episodes: 1000 - gradient_steps: 1 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 150 + gradient_steps: 10 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called @@ -178,14 +178,14 @@ base_ppo: noise_dt: 1 ppo: actor_architecture: dist - train_freq: 64h # how often write_to_learning_role gets called - gamma: 0.99 # Discount factor for future rewards - clip_ratio: 0.2 # Clipping parameter for policy updates - vf_coef: 0.5 # Value function coefficient in the loss function - entropy_coef: 0.01 # Entropy coefficient for exploration - max_grad_norm: 0.5 # Gradient clipping value - gae_lambda: 0.95 # GAE lambda for advantage estimation - batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) + train_freq: 33h # how often write_to_learning_role gets called + gamma: 0.995 # Discount factor for future rewards + clip_ratio: 0.01 # Clipping parameter for policy updates + vf_coef: 0.75 # Value function coefficient in the loss function + entropy_coef: 0.05 # Entropy coefficient for exploration + max_grad_norm: 0.3 # Gradient clipping value + gae_lambda: 0.98 # GAE lambda for advantage estimation + #batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) markets_config: EOM: From 949ba73fbff35329ce974a1330facf4786bacec5 Mon Sep 17 00:00:00 2001 From: adiwied <92265142+AdrianWiedemann@users.noreply.github.com> Date: Mon, 18 Nov 2024 17:46:56 -0600 Subject: [PATCH 18/23] ppo is now stable in ex2a base, added orthogonal initialization and working hyper params --- assume/reinforcement_learning/algorithms/ppo.py | 9 ++------- .../neural_network_architecture.py | 17 +++++++++++++++++ examples/inputs/example_02a/config.yaml | 14 +++++++------- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index 8537fa328..ec4b84f9f 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -58,6 +58,7 @@ def __init__( self.max_grad_norm = max_grad_norm self.gae_lambda = gae_lambda self.n_updates = 0 # Number of updates performed + self.batch_size = learning_role.batch_size # write error if different actor_architecture than dist is used if actor_architecture != "dist": @@ -627,8 +628,7 @@ def update_policy(self): for _ in range(self.gradient_steps): self.n_updates += 1 - batch_size = 32 #todo: get batch_size directly from config - transitions, batch_inds = self.learning_role.buffer.sample(batch_size) + transitions, batch_inds = self.learning_role.buffer.sample(self.batch_size) states = transitions.observations actions = transitions.actions log_probs = transitions.log_probs @@ -754,11 +754,6 @@ def get_actions(rl_strategy, 
next_observation): logger.debug(f"Detached log probability of the sampled action: {log_prob_action}") - # PREVIOUSLY SET TO (-1, 1) - # Bound actions to [0, 1] range - # TODO: Does it make more sense to log probability of the action before or after clamping? - sampled_action = sampled_action.clamp(-1, 1) #--> why clamped and why often zero in action dim 1 - logger.debug(f"Clamped sampled action: {sampled_action}") diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index 8946cf6cd..6008b38c6 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later +import numpy as np import torch as th from torch import nn from torch.nn import functional as F @@ -119,6 +120,10 @@ def __init__(self, n_agents: int, obs_dim: int, act_dim: int, float_type, unique self.FC_3 = nn.Linear(512, 128, dtype=float_type) self.FC_4 = nn.Linear(128, 1, dtype=float_type) + for layer in [self.FC_1, self.FC_2, self.FC_3, self.FC_4]: + nn.init.orthogonal_(layer.weight, gain=np.sqrt(2)) + nn.init.constant_(layer.bias, 0.0) + def forward(self, obs, actions): """ Args: @@ -174,6 +179,18 @@ class DistActor(MLPActor): """ The actor based on the neural network MLP actor that constructs a distribution for the action definition. """ + def __init__(self, obs_dim: int, act_dim: int, float_type, *args, **kwargs): + super().__init__(obs_dim, act_dim, float_type, *args, **kwargs) + + #self.initialize_weights(final_gain=0.1) + + def initialize_weights(self, final_gain=np.sqrt(2)): + for layer in [self.FC1, self.FC2]: + nn.init.orthogonal_(layer.weight, gain=np.sqrt(2)) + nn.init.constant_(layer.bias, 0.0) + # use smaller gain for final layer + nn.init.orthogonal_(self.FC3.weight, gain=final_gain) + nn.init.constant_(self.FC3.bias, 0.0) def forward(self, obs): diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 964264a9d..b4e5eb8e0 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -165,8 +165,8 @@ base_ppo: device: cpu learning_rate: 0.0003 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 150 - gradient_steps: 10 + training_episodes: 100 + gradient_steps: 5 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called @@ -179,13 +179,13 @@ base_ppo: ppo: actor_architecture: dist train_freq: 33h # how often write_to_learning_role gets called - gamma: 0.995 # Discount factor for future rewards - clip_ratio: 0.01 # Clipping parameter for policy updates + gamma: 0.99 # Discount factor for future rewards + clip_ratio: 0.1 # Clipping parameter for policy updates vf_coef: 0.75 # Value function coefficient in the loss function - entropy_coef: 0.05 # Entropy coefficient for exploration + entropy_coef: 0.005 # Entropy coefficient for exploration max_grad_norm: 0.3 # Gradient clipping value - gae_lambda: 0.98 # GAE lambda for advantage estimation - #batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) + gae_lambda: 0.95 # GAE lambda for advantage estimation + batch_size: 11 # Batch size for each update, if mini-batch approach is used (currently not implemented) markets_config: EOM: From 68c16a9a6dceb68635803f1a4229aa6a3b233fc2 Mon Sep 17 00:00:00 2001 From: adiwied
<92265142+AdrianWiedemann@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:07:58 -0600 Subject: [PATCH 19/23] improve hyperparams --- examples/inputs/example_02a/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index b4e5eb8e0..3b31d39fd 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -166,7 +166,7 @@ base_ppo: learning_rate: 0.0003 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 5 + gradient_steps: 10 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called @@ -180,7 +180,7 @@ base_ppo: actor_architecture: dist train_freq: 33h # how often write_to_learning_role gets called gamma: 0.99 # Discount factor for future rewards - clip_ratio: 0.1 # Clipping parameter for policy updates + clip_ratio: 0.05 # Clipping parameter for policy updates vf_coef: 0.75 # Value function coefficient in the loss function entropy_coef: 0.005 # Entropy coefficient for exploration max_grad_norm: 0.3 # Gradient clipping value From 9ce1c36fe5cf0b09b0b779ba9e8f384d9e9685a1 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Thu, 21 Nov 2024 17:30:29 +0100 Subject: [PATCH 20/23] align all configs with additional algorithm feature --- examples/inputs/example_02a/config.yaml | 130 ++++++++++++------------ examples/inputs/example_02b/config.yaml | 54 +++++----- examples/inputs/example_02c/config.yaml | 27 +++-- examples/inputs/example_02d/config.yaml | 59 ++++++----- examples/inputs/example_02e/config.yaml | 54 +++++----- examples/inputs/example_03a/config.yaml | 26 ++--- examples/inputs/example_03b/config.yaml | 26 ++--- 7 files changed, 184 insertions(+), 192 deletions(-) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 5701463bd..d1ca397cf 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -4,24 +4,26 @@ base: end_date: 2019-03-31 00:00 + learning_mode: true learning_config: + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 algorithm: matd3 - batch_size: 256 - continue_learning: false device: cpu - episodes_collecting_initial_experience: 5 - gamma: 0.99 - gradient_steps: -1 - learning_rate: 0.001 - max_bid_price: 100 - noise_dt: 1 - noise_scale: 1 - noise_sigma: 0.1 - train_freq: 24h - trained_policies_save_path: null + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - validation_episodes_interval: 5 - learning_mode: true + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: market_mechanism: pay_as_clear @@ -41,28 +43,28 @@ base: save_frequency_hours: null start_date: 2019-03-01 00:00 time_step: 1h + base_lstm: end_date: 2019-03-31 00:00 learning_config: - actor_architecture: lstm + continue_learning: False + trained_policies_save_path: null + max_bid_price: 100 algorithm: matd3 - batch_size: 256 - continue_learning: false device: cpu - early_stopping_steps: 10 - early_stopping_threshold: 0.05 - episodes_collecting_initial_experience: 5 - gamma: 0.99 - gradient_steps: -1 - 
learning_rate: 0.001 - max_bid_price: 100 - noise_dt: 1 - noise_scale: 1 - noise_sigma: 0.1 - train_freq: 24h - trained_policies_save_path: null - training_episodes: 50 - validation_episodes_interval: 5 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: lstm + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 learning_mode: true markets_config: EOM: @@ -93,41 +95,35 @@ tiny_ppo: time_step: 1h save_frequency_hours: null learning_mode: True - learning_config: continue_learning: False trained_policies_save_path: null max_bid_price: 100 algorithm: ppo device: cpu - learning_rate: 0.001 - buffer_size: 5e5 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 matd3: actor_architecture: mlp - training_episodes: 10 + train_freq: 24h # how often write_to_learning_role gets called episodes_collecting_initial_experience: 3 - train_freq: 48h - gradient_steps: -1 batch_size: 64 gamma: 0.99 noise_sigma: 0.1 noise_scale: 1 noise_dt: 1 - validation_episodes_interval: 5 ppo: - actor_architecture: mlp - training_episodes: 10 - validation_episodes_interval: 5 # after how many episodes the validation starts and the policy is updated - train_freq: 64h # how often write_to_learning_role gets called - gamma: 0.99 # Discount factor for future rewards - epochs: 5 # #4 # Number of epochs for updating the policy - clip_ratio: 0.2 # Clipping parameter for policy updates - vf_coef: 0.5 # Value function coefficient in the loss function - entropy_coef: 0.01 # Entropy coefficient for exploration - max_grad_norm: 0.5 # Gradient clipping value + actor_architecture: dist + train_freq: 33h # how often write_to_learning_role gets called + gamma: 0.99 # Discount factor for future rewards + clip_ratio: 0.05 # Clipping parameter for policy updates + vf_coef: 0.75 # Value function coefficient in the loss function + entropy_coef: 0.005 # Entropy coefficient for exploration + max_grad_norm: 0.3 # Gradient clipping value gae_lambda: 0.95 # GAE lambda for advantage estimation - batch_size: 5 # Batch size for each update, if mini-batch approach is used (currently not implemented) - + batch_size: 11 # Batch size for each update, if mini-batch approach is used (currently not implemented) markets_config: EOM: operator: EOM_operator @@ -152,7 +148,6 @@ base_ppo: time_step: 1h save_frequency_hours: null learning_mode: True - learning_config: continue_learning: False trained_policies_save_path: null @@ -206,23 +201,24 @@ base_lstm: tiny: end_date: 2019-01-05 00:00 learning_config: - actor_architecture: mlp - algorithm: matd3 - batch_size: 64 - continue_learning: false - device: cpu - episodes_collecting_initial_experience: 3 - gamma: 0.99 - gradient_steps: -1 - learning_rate: 0.001 - max_bid_price: 100 - noise_dt: 1 - noise_scale: 1 - noise_sigma: 0.1 - train_freq: 24h + continue_learning: False trained_policies_save_path: null - training_episodes: 10 - validation_episodes_interval: 5 + max_bid_price: 100 + algorithm: ppo + device: cpu + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + 
actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 learning_mode: true markets_config: EOM: diff --git a/examples/inputs/example_02b/config.yaml b/examples/inputs/example_02b/config.yaml index d50d8a170..acbe56093 100644 --- a/examples/inputs/example_02b/config.yaml +++ b/examples/inputs/example_02b/config.yaml @@ -14,21 +14,20 @@ base: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 100 - episodes_collecting_initial_experience: 3 - train_freq: 24h - gradient_steps: 1 - batch_size: 256 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: @@ -59,21 +58,20 @@ base_lstm: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: lstm - learning_rate: 0.001 - training_episodes: 100 - episodes_collecting_initial_experience: 3 - train_freq: 24h - gradient_steps: 1 - batch_size: 256 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: diff --git a/examples/inputs/example_02c/config.yaml b/examples/inputs/example_02c/config.yaml index 0cd3f4091..d9f03497e 100644 --- a/examples/inputs/example_02c/config.yaml +++ b/examples/inputs/example_02c/config.yaml @@ -14,21 +14,20 @@ base: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 100 - episodes_collecting_initial_experience: 3 - train_freq: 24h - gradient_steps: 1 - batch_size: 256 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: diff --git a/examples/inputs/example_02d/config.yaml b/examples/inputs/example_02d/config.yaml index e52c39c78..3abd89c64 100644 --- a/examples/inputs/example_02d/config.yaml +++ b/examples/inputs/example_02d/config.yaml @@ -12,24 +12,22 @@ dam: 
learning_config: continue_learning: False trained_policies_save_path: null - max_bid_price: 200 + max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.0001 - training_episodes: 30 - episodes_collecting_initial_experience: 5 - train_freq: 24h - gradient_steps: -1 - batch_size: 128 - gamma: 0.99 - device: cuda:0 - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - order_types: ["SB", "BB", "LB"] - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + device: cpu + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: @@ -62,23 +60,24 @@ tiny: learning_mode: True learning_config: - continue_learning: True + continue_learning: False trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 3 - episodes_collecting_initial_experience: 1 - train_freq: 24h - gradient_steps: -1 - batch_size: 128 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 1 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: diff --git a/examples/inputs/example_02e/config.yaml b/examples/inputs/example_02e/config.yaml index 5011950cf..00b67fa81 100644 --- a/examples/inputs/example_02e/config.yaml +++ b/examples/inputs/example_02e/config.yaml @@ -12,21 +12,22 @@ tiny: learning_config: continue_learning: False trained_policies_save_path: null - max_bid_price: 50 + max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.001 - training_episodes: 5 - validation_episodes_interval: 2 - episodes_collecting_initial_experience: 1 - train_freq: 24h - gradient_steps: -1 - batch_size: 64 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: @@ -58,21 +59,20 @@ base: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - actor_architecture: mlp - learning_rate: 0.0001 - training_episodes: 200 - episodes_collecting_initial_experience: 5 - train_freq: 1000h - gradient_steps: -1 - batch_size: 256 - gamma: 0.999 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + 
gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: diff --git a/examples/inputs/example_03a/config.yaml b/examples/inputs/example_03a/config.yaml index 9acc8f80a..a7973cb57 100644 --- a/examples/inputs/example_03a/config.yaml +++ b/examples/inputs/example_03a/config.yaml @@ -14,20 +14,20 @@ base_case_2019: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - learning_rate: 0.001 - training_episodes: 50 - episodes_collecting_initial_experience: 5 - train_freq: 24h - gradient_steps: 1 - batch_size: 256 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: diff --git a/examples/inputs/example_03b/config.yaml b/examples/inputs/example_03b/config.yaml index 176397bd3..1af2dbbd5 100644 --- a/examples/inputs/example_03b/config.yaml +++ b/examples/inputs/example_03b/config.yaml @@ -14,20 +14,20 @@ base_case_2021: trained_policies_save_path: null max_bid_price: 100 algorithm: matd3 - learning_rate: 0.001 - training_episodes: 50 - episodes_collecting_initial_experience: 5 - train_freq: 24h - gradient_steps: 1 - batch_size: 256 - gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - validation_episodes_interval: 5 - early_stopping_steps: 10 - early_stopping_threshold: 0.05 + learning_rate: 0.0003 + validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated + training_episodes: 100 + gradient_steps: 10 + matd3: + actor_architecture: mlp + train_freq: 24h # how often write_to_learning_role gets called + episodes_collecting_initial_experience: 3 + batch_size: 64 + gamma: 0.99 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 markets_config: EOM: From 3f1ac6c0f6291ac88065772d5b456b0094723429 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Mon, 25 Nov 2024 10:55:29 +0100 Subject: [PATCH 21/23] - adjusted all learning configs to match new config format --- examples/inputs/example_02a/config.yaml | 219 +++++++----------- examples/inputs/example_02b/config.yaml | 4 +- examples/inputs/example_02c/config.yaml | 4 +- examples/inputs/example_02d/config.yaml | 8 +- examples/inputs/example_02e/config.yaml | 8 +- examples/inputs/example_03a/config.yaml | 4 +- examples/inputs/example_03b/config.yaml | 4 +- ...forcement_learning_algorithm_example.ipynb | 30 ++- 8 files changed, 119 insertions(+), 162 deletions(-) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index d1ca397cf..8758afd0d 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -1,29 +1,24 @@ -# SPDX-FileCopyrightText: ASSUME Developers -# -# SPDX-License-Identifier: AGPL-3.0-or-later - base: end_date: 2019-03-31 00:00 - learning_mode: true learning_config: - continue_learning: False - trained_policies_save_path: null - max_bid_price: 
100 algorithm: matd3 + continue_learning: false device: cpu - learning_rate: 0.0003 - validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 + learning_rate: 0.001 matd3: actor_architecture: mlp - train_freq: 24h # how often write_to_learning_role gets called - episodes_collecting_initial_experience: 3 batch_size: 64 + episodes_collecting_initial_experience: 3 gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 noise_dt: 1 + noise_scale: 1 + noise_sigma: 0.1 + max_bid_price: 100 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 + learning_mode: true markets_config: EOM: market_mechanism: pay_as_clear @@ -43,28 +38,40 @@ base: save_frequency_hours: null start_date: 2019-03-01 00:00 time_step: 1h - base_lstm: + start_date: 2019-03-01 00:00 + time_step: 1h +base_ppo: end_date: 2019-03-31 00:00 learning_config: - continue_learning: False - trained_policies_save_path: null - max_bid_price: 100 - algorithm: matd3 + algorithm: ppo + continue_learning: false device: cpu - learning_rate: 0.0003 - validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 gradient_steps: 10 + learning_rate: 0.0003 matd3: - actor_architecture: lstm - train_freq: 24h # how often write_to_learning_role gets called - episodes_collecting_initial_experience: 3 + actor_architecture: mlp batch_size: 64 + episodes_collecting_initial_experience: 3 gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 noise_dt: 1 + noise_scale: 1 + noise_sigma: 0.1 + train_freq: 24h + max_bid_price: 100 + ppo: + actor_architecture: dist + batch_size: 11 + clip_ratio: 0.05 + entropy_coef: 0.005 + gae_lambda: 0.95 + gamma: 0.99 + max_grad_norm: 0.3 + train_freq: 33h + vf_coef: 0.75 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 learning_mode: true markets_config: EOM: @@ -82,143 +89,81 @@ base_lstm: duration: 1h first_delivery: 1h volume_unit: MWh - maximum_bid_volume: 100000 - maximum_bid_price: 3000 - minimum_bid_price: -500 - price_unit: EUR/MWh - market_mechanism: pay_as_clear - - -tiny_ppo: - start_date: 2019-01-01 00:00 - end_date: 2019-01-05 00:00 - time_step: 1h save_frequency_hours: null - learning_mode: True - learning_config: - continue_learning: False - trained_policies_save_path: null - max_bid_price: 100 - algorithm: ppo - device: cpu - learning_rate: 0.0003 - validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 - gradient_steps: 10 - matd3: - actor_architecture: mlp - train_freq: 24h # how often write_to_learning_role gets called - episodes_collecting_initial_experience: 3 - batch_size: 64 - gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 - ppo: - actor_architecture: dist - train_freq: 33h # how often write_to_learning_role gets called - gamma: 0.99 # Discount factor for future rewards - clip_ratio: 0.05 # Clipping parameter for policy updates - vf_coef: 0.75 # Value function coefficient in the loss function - entropy_coef: 0.005 # Entropy coefficient for exploration - max_grad_norm: 0.3 # Gradient clipping value - gae_lambda: 0.95 # GAE lambda for advantage estimation - batch_size: 11 # Batch size for each update, if mini-batch approach is used (currently not implemented) - markets_config: - EOM: - operator: EOM_operator - product_type: energy - products: - - duration: 
1h - count: 1 - first_delivery: 1h - opening_frequency: 1h - opening_duration: 1h - volume_unit: MWh - maximum_bid_volume: 100000 - maximum_bid_price: 3000 - minimum_bid_price: -500 - price_unit: EUR/MWh - market_mechanism: pay_as_clear - - -base_ppo: start_date: 2019-03-01 00:00 - end_date: 2019-03-31 00:00 time_step: 1h - save_frequency_hours: null - learning_mode: True +tiny: + end_date: 2019-01-05 00:00 learning_config: - continue_learning: False - trained_policies_save_path: null - max_bid_price: 100 algorithm: ppo + continue_learning: false device: cpu - learning_rate: 0.0003 - validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 gradient_steps: 10 + learning_rate: 0.0003 matd3: actor_architecture: mlp - train_freq: 24h # how often write_to_learning_role gets called - episodes_collecting_initial_experience: 3 batch_size: 64 + episodes_collecting_initial_experience: 3 gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 noise_dt: 1 - ppo: - actor_architecture: dist - train_freq: 33h # how often write_to_learning_role gets called - gamma: 0.99 # Discount factor for future rewards - clip_ratio: 0.05 # Clipping parameter for policy updates - vf_coef: 0.75 # Value function coefficient in the loss function - entropy_coef: 0.005 # Entropy coefficient for exploration - max_grad_norm: 0.3 # Gradient clipping value - gae_lambda: 0.95 # GAE lambda for advantage estimation - batch_size: 11 # Batch size for each update, if mini-batch approach is used (currently not implemented) - + noise_scale: 1 + noise_sigma: 0.1 + train_freq: 24h + max_bid_price: 100 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 + learning_mode: true markets_config: EOM: + market_mechanism: pay_as_clear + maximum_bid_price: 3000 + maximum_bid_volume: 100000 + minimum_bid_price: -500 + opening_duration: 1h + opening_frequency: 1h operator: EOM_operator + price_unit: EUR/MWh product_type: energy products: - - duration: 1h - count: 1 - first_delivery: 1h - opening_frequency: 1h - opening_duration: 1h + - count: 1 + duration: 1h + first_delivery: 1h volume_unit: MWh - maximum_bid_volume: 100000 - maximum_bid_price: 3000 - minimum_bid_price: -500 - price_unit: EUR/MWh - market_mechanism: pay_as_clear - -base_lstm: - start_date: 2019-03-01 00:00 + save_frequency_hours: null + start_date: 2019-01-01 00:00 time_step: 1h -tiny: +tiny_ppo: end_date: 2019-01-05 00:00 learning_config: - continue_learning: False - trained_policies_save_path: null - max_bid_price: 100 algorithm: ppo + continue_learning: false device: cpu - learning_rate: 0.0003 - validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated - training_episodes: 100 gradient_steps: 10 + learning_rate: 0.0003 matd3: actor_architecture: mlp - train_freq: 24h # how often write_to_learning_role gets called - episodes_collecting_initial_experience: 3 batch_size: 64 + episodes_collecting_initial_experience: 3 gamma: 0.99 - noise_sigma: 0.1 - noise_scale: 1 noise_dt: 1 + noise_scale: 1 + noise_sigma: 0.1 + train_freq: 24h + max_bid_price: 100 + ppo: + actor_architecture: dist + batch_size: 11 + clip_ratio: 0.05 + entropy_coef: 0.005 + gae_lambda: 0.95 + gamma: 0.99 + max_grad_norm: 0.3 + train_freq: 33h + vf_coef: 0.75 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 learning_mode: true markets_config: EOM: diff --git a/examples/inputs/example_02b/config.yaml 
b/examples/inputs/example_02b/config.yaml index acbe56093..e40f88867 100644 --- a/examples/inputs/example_02b/config.yaml +++ b/examples/inputs/example_02b/config.yaml @@ -59,10 +59,10 @@ base_lstm: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/inputs/example_02c/config.yaml b/examples/inputs/example_02c/config.yaml index d9f03497e..694e76248 100644 --- a/examples/inputs/example_02c/config.yaml +++ b/examples/inputs/example_02c/config.yaml @@ -15,10 +15,10 @@ base: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/inputs/example_02d/config.yaml b/examples/inputs/example_02d/config.yaml index 3abd89c64..b79e8dbe6 100644 --- a/examples/inputs/example_02d/config.yaml +++ b/examples/inputs/example_02d/config.yaml @@ -15,10 +15,10 @@ dam: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called @@ -65,10 +65,10 @@ tiny: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/inputs/example_02e/config.yaml b/examples/inputs/example_02e/config.yaml index 00b67fa81..7a08fab36 100644 --- a/examples/inputs/example_02e/config.yaml +++ b/examples/inputs/example_02e/config.yaml @@ -15,10 +15,10 @@ tiny: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called @@ -60,10 +60,10 @@ base: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/inputs/example_03a/config.yaml b/examples/inputs/example_03a/config.yaml index a7973cb57..89b146a18 100644 --- a/examples/inputs/example_03a/config.yaml +++ b/examples/inputs/example_03a/config.yaml @@ -15,10 +15,10 @@ base_case_2019: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how 
many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/inputs/example_03b/config.yaml b/examples/inputs/example_03b/config.yaml index 1af2dbbd5..15f60ddb3 100644 --- a/examples/inputs/example_03b/config.yaml +++ b/examples/inputs/example_03b/config.yaml @@ -15,10 +15,10 @@ base_case_2021: max_bid_price: 100 algorithm: matd3 device: cpu - learning_rate: 0.0003 + learning_rate: 0.001 validation_episodes_interval: 10 # after how many episodes the validation starts and the policy is updated training_episodes: 100 - gradient_steps: 10 + gradient_steps: -1 matd3: actor_architecture: mlp train_freq: 24h # how often write_to_learning_role gets called diff --git a/examples/notebooks/04_reinforcement_learning_algorithm_example.ipynb b/examples/notebooks/04_reinforcement_learning_algorithm_example.ipynb index 291fcf344..b693da0e8 100644 --- a/examples/notebooks/04_reinforcement_learning_algorithm_example.ipynb +++ b/examples/notebooks/04_reinforcement_learning_algorithm_example.ipynb @@ -843,18 +843,20 @@ " \"trained_policies_save_path\": None,\n", " \"max_bid_price\": 100,\n", " \"algorithm\": \"matd3\",\n", + " \"device\": \"cpu\",\n", " \"learning_rate\": 0.001,\n", + " \"validation_episodes_interval\": 10,\n", " \"training_episodes\": 100,\n", - " \"episodes_collecting_initial_experience\": 5,\n", - " \"train_freq\": \"24h\",\n", " \"gradient_steps\": -1,\n", - " \"batch_size\": 256,\n", - " \"gamma\": 0.99,\n", - " \"device\": \"cpu\",\n", - " \"noise_sigma\": 0.1,\n", - " \"noise_scale\": 1,\n", - " \"noise_dt\": 1,\n", - " \"validation_episodes_interval\": 5,\n", + " \"matd3\":{\n", + " \"actor_architecture\": \"mlp\",\n", + " \"episodes_collecting_initial_experience\": 3,\n", + " \"batch_size\": 64,\n", + " \"gamma\": 0.99,\n", + " \"noise_sigma\": 0.1,\n", + " \"noise_scale\": 1,\n", + " \"noise_dt\": 1,\n", + " }\n", "}" ] }, @@ -880,6 +882,16 @@ " yaml.safe_dump(data, file)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcf605c7", + "metadata": {}, + "outputs": [], + "source": [ + "data" + ] + }, { "cell_type": "markdown", "id": "4bea575f", From 61e9bfae69a4d03e332310392b1d453301689f66 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Tue, 3 Dec 2024 11:23:28 +0100 Subject: [PATCH 22/23] - cleaned weirdly merged configs - made dependent logging in asyn_run function to avoid breaking Traning Episode Counter - fixed reading of right needed actor architecture dependend on algorithm for run with pretrained staretgey (without learning_role) --- .../reinforcement_learning/algorithms/ppo.py | 159 ------------------ .../neural_network_architecture.py | 10 +- assume/scenario/loader_csv.py | 2 +- assume/strategies/learning_strategies.py | 3 +- assume/world.py | 6 +- examples/inputs/example_02a/config.yaml | 108 ++++++------ 6 files changed, 65 insertions(+), 223 deletions(-) diff --git a/assume/reinforcement_learning/algorithms/ppo.py b/assume/reinforcement_learning/algorithms/ppo.py index ec4b84f9f..788719bfc 100644 --- a/assume/reinforcement_learning/algorithms/ppo.py +++ b/assume/reinforcement_learning/algorithms/ppo.py @@ -78,27 +78,6 @@ def save_params(self, directory): self.save_critic_params(directory=f"{directory}/critics") self.save_actor_params(directory=f"{directory}/actors") - - # Removed critic_target in comparison to MATD3 - # Decentralized - # def save_critic_params(self, 
directory): - # """ - # Save the parameters of critic networks. - - # This method saves the parameters of the critic networks, including the critic's state_dict and the critic's optimizer state_dict. - # It organizes the saved parameters into a directory structure specific to the critic associated with each learning strategy. - - # Args: - # directory (str): The base directory for saving the parameters. - # """ - # os.makedirs(directory, exist_ok=True) - # for u_id in self.learning_role.rl_strats.keys(): - # obj = { - # "critic": self.learning_role.rl_strats[u_id].critic.state_dict(), - # "critic_optimizer": self.learning_role.rl_strats[u_id].critic.optimizer.state_dict(), - # } - # path = f"{directory}/critic_{u_id}.pt" - # th.save(obj, path) # Centralized @@ -164,40 +143,6 @@ def load_params(self, directory: str) -> None: self.load_critic_params(directory) self.load_actor_params(directory) - # Removed critic_target in comparison to MATD3 (critic network = value function network) - # Decentralized - # def load_critic_params(self, directory: str) -> None: - # """ - # Load the parameters of critic networks from a specified directory. - - # This method loads the parameters of critic networks, including the critic's state_dict and - # the critic's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated - # with the learning role, loads the respective parameters, and updates the critic networks accordingly. - - # Args: - # directory (str): The directory from which the parameters should be loaded. - # """ - # logger.info("Loading critic parameters...") - - # if not os.path.exists(directory): - # logger.warning( - # "Specified directory for loading the critics does not exist! Starting with randomly initialized values!" - # ) - # return - - # for u_id in self.learning_role.rl_strats.keys(): - # try: - # critic_params = self.load_obj( - # directory=f"{directory}/critics/critic_{str(u_id)}.pt" - # ) - # self.learning_role.rl_strats[u_id].critic.load_state_dict( - # critic_params["critic"] - # ) - # self.learning_role.rl_strats[u_id].critic.optimizer.load_state_dict( - # critic_params["critic_optimizer"] - # ) - # except Exception: - # logger.warning(f"No critic values loaded for agent {u_id}") # Centralized @@ -270,33 +215,6 @@ def load_actor_params(self, directory: str) -> None: except Exception: logger.warning(f"No actor values loaded for agent {u_id}") - # Removed target_critics and actor_target in comparison to MATD3 - # Decentralized - # def initialize_policy(self, actors_and_critics: dict = None) -> None: - # """ - # Create actor and critic networks for reinforcement learning. - - # If `actors_and_critics` is None, this method creates new actor and critic networks. - # If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. - - # Args: - # actors_and_critics (dict): The actor and critic networks to be assigned. 
- # """ - # if actors_and_critics is None: - # self.create_actors() - # self.create_critics() - # else: - # # Decentralized initialization of actors and critics - # for u_id, unit_strategy in self.learning_role.rl_strats.items(): - # unit_strategy.actor = actors_and_critics["actors"][u_id] - # # unit_strategy.actor_target = actors_and_critics["actor_targets"][u_id] - # unit_strategy.critic = actors_and_critics["critics"][u_id] - # # unit_strategy.critic_target = actors_and_critics["critic_targets"][u_id] - - # # Assign shared dimensions - # self.obs_dim = actors_and_critics["obs_dim"] - # self.act_dim = actors_and_critics["act_dim"] - # self.unique_obs_dim = actors_and_critics["unique_obs_dim"] # Centralized def initialize_policy(self, actors_and_critics: dict = None) -> None: @@ -372,44 +290,6 @@ def create_actors(self) -> None: else: self.act_dim = act_dim_list[0] - # Removed target_critics in comparison to MATD3 - # Changed initialization of CriticPPO compared to MATD3 - # Decentralized - # def create_critics(self) -> None: - # """ - # Create decentralized critic networks for reinforcement learning. - - # This method initializes a separate critic network for each agent in the reinforcement learning setup. - # Each critic learns to predict the value function based on the individual agent's observation. - - # Notes: - # Each agent has its own critic, so the critic is no longer shared among all agents. - # """ - - # unique_obs_dim_list = [] - - # for _, unit_strategy in self.learning_role.rl_strats.items(): - # unit_strategy.critic = CriticPPO( - # obs_dim=unit_strategy.obs_dim, - # float_type=self.float_type, - # ).to(self.device) - - # unit_strategy.critic.optimizer = Adam( - # unit_strategy.critic.parameters(), lr=self.learning_rate - # ) - - # unique_obs_dim_list.append(unit_strategy.unique_obs_dim) - - # # Check if all unique_obs_dim are the same and raise an error if not - # # If they are all the same, set the unique_obs_dim attribute - # if len(set(unique_obs_dim_list)) > 1: - # raise ValueError( - # "All unique_obs_dim values must be the same for all RL agents" - # ) - # else: - # self.unique_obs_dim = unique_obs_dim_list[0] - - # Centralized def create_critics(self) -> None: @@ -455,34 +335,6 @@ def create_critics(self) -> None: else: self.unique_obs_dim = unique_obs_dim_list[0] - # Decentralized - # def extract_policy(self) -> dict: - # """ - # Extract actor and critic networks. - - # This method extracts the actor and critic networks associated with each learning strategy and organizes them into a - # dictionary structure. The extracted networks include actors and critics. The resulting - # dictionary is typically used for saving and sharing these networks. - - # Returns: - # dict: The extracted actor and critic networks. 
- # """ - # actors = {} - # critics = {} - - # for u_id, unit_strategy in self.learning_role.rl_strats.items(): - # actors[u_id] = unit_strategy.actor - # critics[u_id] = unit_strategy.critic - - # actors_and_critics = { - # "actors": actors, - # "critics": critics, - # "obs_dim": self.obs_dim, - # "act_dim": self.act_dim, - # "unique_obs_dim": self.unique_obs_dim, - # } - - # return actors_and_critics # Centralized def extract_policy(self) -> dict: @@ -614,16 +466,7 @@ def update_policy(self): full_values = self.get_values(full_transitions.observations, full_transitions.actions) full_advantages, full_returns = self.get_advantages(full_transitions.rewards, full_values) - #states = transitions.observations - #actions = transitions.actions - #rewards = transitions.rewards - #log_probs = transitions.log_probs - - # Pass the current states through the critic network to get value estimates. - #values = self.get_values(states, actions) - # Compute advantages using Generalized Advantage Estimation (GAE) - #advantages, returns = self.get_advantages(rewards, values) for _ in range(self.gradient_steps): self.n_updates += 1 @@ -754,8 +597,6 @@ def get_actions(rl_strategy, next_observation): logger.debug(f"Detached log probability of the sampled action: {log_prob_action}") - logger.debug(f"Clamped sampled action: {sampled_action}") - return sampled_action, log_prob_action diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index 6008b38c6..3cd8babc0 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -95,7 +95,7 @@ def q1_forward(self, obs, actions): class CriticPPO(nn.Module): """Critic Network for Proximal Policy Optimization (PPO). - Centralized critic. + Centralized critic, meaning that is has access to the observation space of all competitive learning agents. Args: n_agents (int): Number of agents @@ -182,8 +182,7 @@ class DistActor(MLPActor): def __init__(self, obs_dim: int, act_dim: int, float_type, *args, **kwargs): super().__init__(obs_dim, act_dim, float_type, *args, **kwargs) - #self.initialize_weights(final_gain=0.1) - + def initialize_weights(self, final_gain=np.sqrt(2)): for layer in [self.FC1, self.FC2]: nn.init.orthogonal_(layer.weight, gain=np.sqrt(2)) @@ -198,11 +197,6 @@ def forward(self, obs): x = F.relu(self.FC2(x)) # Works with MATD3, output of softsign: [-1, 1] x = F.softsign(self.FC3(x)) - - # x = th.tanh(self.FC3(x)) - - # Tested for PPO, scales the output to [0, 1] range - #x = th.sigmoid(self.FC3(x)) # Create a normal distribution for continuous actions (with assumed standard deviation of # TODO: 0.01/0.0 as in marlbenchmark or 1.0 or sheduled decrease?) diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 65dd8d653..fe70fa5f4 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -997,6 +997,7 @@ def run_learning( ) if world.learning_role.rl_algorithm_name == "ppo": + # TODO: add surrogate loss as a parameter to compare_and_save_policies # PPO uses the surrogate loss to monitor policy updates. # The surrogate loss quantifies how much the new policy has changed compared to the old one. 
# If the surrogate loss becomes too small or too large, it can indicate issues: @@ -1013,7 +1014,6 @@ def run_learning( # terminate = world.learning_role.compare_and_save_policies({"surrogate_loss": surrogate_loss}) # Reset the PPO Rollout Buffer after each update - # TODO: add surrogate loss as a parameter to compare_and_save_policies inter_episodic_data["buffer"].reset() total_rewards = world.output_role.get_sum_reward() diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index 71852ca38..8881ba3b4 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -139,7 +139,8 @@ def __init__(self, *args, **kwargs): # based on learning config self.algorithm = kwargs.get("algorithm", "matd3") - actor_architecture = kwargs.get("actor_architecture", "mlp") + algo_config = kwargs.get(self.algorithm, {}) + actor_architecture = algo_config.get("actor_architecture", "mlp") if actor_architecture in actor_architecture_aliases.keys(): self.actor_architecture_class = actor_architecture_aliases[ diff --git a/assume/world.py b/assume/world.py index 3c2a0d81e..6eb632319 100644 --- a/assume/world.py +++ b/assume/world.py @@ -656,11 +656,13 @@ async def async_run(self, start_ts: datetime, end_ts: datetime): start_ts (datetime.datetime): The start timestamp for the simulation run. end_ts (datetime.datetime): The end timestamp for the simulation run. """ - logger.info("activating container") + if not self.learning_mode: + logger.info("activating container") # agent is implicit added to self.container._agents async with activate(self.container) as c: await tasks_complete_or_sleeping(c) - logger.info("all agents up - starting simulation") + if not self.learning_mode: + logger.info("all agents up - starting simulation") pbar = tqdm(total=end_ts - start_ts) # allow registration before first opening diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 8758afd0d..9347e9f6c 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -1,11 +1,19 @@ -base: - end_date: 2019-03-31 00:00 +tiny: + start_date: 2019-01-01 00:00 + end_date: 2019-01-05 00:00 + save_frequency_hours: null + time_step: 1h + learning_mode: true learning_config: - algorithm: matd3 + algorithm: ppo continue_learning: false device: cpu - gradient_steps: -1 - learning_rate: 0.001 + gradient_steps: 10 + learning_rate: 0.0003 + max_bid_price: 100 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 matd3: actor_architecture: mlp batch_size: 64 @@ -14,11 +22,7 @@ base: noise_dt: 1 noise_scale: 1 noise_sigma: 0.1 - max_bid_price: 100 - trained_policies_save_path: null - training_episodes: 100 - validation_episodes_interval: 10 - learning_mode: true + train_freq: 24h markets_config: EOM: market_mechanism: pay_as_clear @@ -35,20 +39,21 @@ base: duration: 1h first_delivery: 1h volume_unit: MWh - save_frequency_hours: null - start_date: 2019-03-01 00:00 - time_step: 1h -base_lstm: - start_date: 2019-03-01 00:00 - time_step: 1h -base_ppo: + + +base: end_date: 2019-03-31 00:00 learning_config: - algorithm: ppo + algorithm: matd3 continue_learning: false device: cpu - gradient_steps: 10 - learning_rate: 0.0003 + gradient_steps: -1 + learning_rate: 0.001 + max_bid_price: 100 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 + learning_mode: true matd3: actor_architecture: mlp batch_size: 64 @@ -57,22 
+62,6 @@ base_ppo: noise_dt: 1 noise_scale: 1 noise_sigma: 0.1 - train_freq: 24h - max_bid_price: 100 - ppo: - actor_architecture: dist - batch_size: 11 - clip_ratio: 0.05 - entropy_coef: 0.005 - gae_lambda: 0.95 - gamma: 0.99 - max_grad_norm: 0.3 - train_freq: 33h - vf_coef: 0.75 - trained_policies_save_path: null - training_episodes: 100 - validation_episodes_interval: 10 - learning_mode: true markets_config: EOM: market_mechanism: pay_as_clear @@ -92,14 +81,24 @@ base_ppo: save_frequency_hours: null start_date: 2019-03-01 00:00 time_step: 1h -tiny: - end_date: 2019-01-05 00:00 + + +base_ppo: + save_frequency_hours: null + start_date: 2019-03-01 00:00 + time_step: 1h + end_date: 2019-03-31 00:00 + learning_mode: false learning_config: algorithm: ppo continue_learning: false device: cpu gradient_steps: 10 learning_rate: 0.0003 + max_bid_price: 100 + trained_policies_save_path: "learned_strategies/base_ppo/avg_reward_eval_policies" + training_episodes: 100 + validation_episodes_interval: 10 matd3: actor_architecture: mlp batch_size: 64 @@ -109,11 +108,16 @@ tiny: noise_scale: 1 noise_sigma: 0.1 train_freq: 24h - max_bid_price: 100 - trained_policies_save_path: null - training_episodes: 100 - validation_episodes_interval: 10 - learning_mode: true + ppo: + actor_architecture: dist + batch_size: 11 + clip_ratio: 0.05 + entropy_coef: 0.005 + gae_lambda: 0.95 + gamma: 0.99 + max_grad_norm: 0.3 + train_freq: 33h + vf_coef: 0.75 markets_config: EOM: market_mechanism: pay_as_clear @@ -130,17 +134,24 @@ tiny: duration: 1h first_delivery: 1h volume_unit: MWh + + +tiny_ppo: save_frequency_hours: null start_date: 2019-01-01 00:00 time_step: 1h -tiny_ppo: end_date: 2019-01-05 00:00 + learning_mode: true learning_config: algorithm: ppo continue_learning: false device: cpu gradient_steps: 10 learning_rate: 0.0003 + trained_policies_save_path: null + training_episodes: 100 + validation_episodes_interval: 10 + max_bid_price: 100 matd3: actor_architecture: mlp batch_size: 64 @@ -150,7 +161,6 @@ tiny_ppo: noise_scale: 1 noise_sigma: 0.1 train_freq: 24h - max_bid_price: 100 ppo: actor_architecture: dist batch_size: 11 @@ -161,10 +171,6 @@ tiny_ppo: max_grad_norm: 0.3 train_freq: 33h vf_coef: 0.75 - trained_policies_save_path: null - training_episodes: 100 - validation_episodes_interval: 10 - learning_mode: true markets_config: EOM: market_mechanism: pay_as_clear @@ -181,6 +187,4 @@ tiny_ppo: duration: 1h first_delivery: 1h volume_unit: MWh - save_frequency_hours: null - start_date: 2019-01-01 00:00 - time_step: 1h + From 3fa7359ca6f7dfa875be9090e883c78951016866 Mon Sep 17 00:00:00 2001 From: kim-mskw Date: Wed, 4 Dec 2024 14:22:47 +0100 Subject: [PATCH 23/23] - adjusted exploration noise handling to fit to PPO, makes merge commit runable --- assume/common/base.py | 19 +----- .../reinforcement_learning/learning_role.py | 59 +++++++++---------- assume/strategies/learning_strategies.py | 2 +- examples/inputs/example_02a/config.yaml | 9 ++- 4 files changed, 37 insertions(+), 52 deletions(-) diff --git a/assume/common/base.py b/assume/common/base.py index f03830781..4b8f6a8b6 100644 --- a/assume/common/base.py +++ b/assume/common/base.py @@ -68,37 +68,20 @@ def __init__( for strategy in self.bidding_strategies.values() ): self.outputs["actions"] = TensorFastSeries(value=0.0, index=self.index) - self.outputs["exploration_noise"] = TensorFastSeries( - value=0.0, - index=self.index, - ) self.outputs["reward"] = FastSeries(value=0.0, index=self.index) self.outputs["regret"] = FastSeries(value=0.0, index=self.index) 
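        # (Editor's illustrative note, not part of the patch.) The RL output lists kept on each
        # unit a few lines below (rl_observations, rl_actions, rl_rewards, rl_log_probs) are the
        # per-step material for the PPO rollout buffer sampled earlier in this series. A hedged
        # sketch of how they might be flushed into such a buffer; the method name `buffer.add`
        # and the stacking below are assumptions for illustration, not the project's actual API:
        #
        #     obs = th.stack(unit.outputs["rl_observations"])   # (T, obs_dim)
        #     acts = th.stack(unit.outputs["rl_actions"])       # (T, act_dim)
        #     rews = th.tensor(unit.outputs["rl_rewards"])      # (T,)
        #     logps = th.stack(unit.outputs["rl_log_probs"])    # (T,)
        #     buffer.add(obs, acts, rews, logps)                # hypothetical signature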
self.avg_op_time = 0 self.total_op_time = 0 - # some data is stored as series to allow to store it in the outputs - # check if any bidding strategy is using the RL strategy - if any( - isinstance(strategy, LearningStrategy) - for strategy in self.bidding_strategies.values() - ): - self.outputs["actions"] = TensorFastSeries(value=0.0, index=self.index) - self.outputs["exploration_noise"] = TensorFastSeries( - value=0.0, - index=self.index, - ) - self.outputs["reward"] = FastSeries(value=0.0, index=self.index) - self.outputs["regret"] = FastSeries(value=0.0, index=self.index) # RL data stored as lists to simplify storing to the buffer self.outputs["rl_observations"] = [] self.outputs["rl_actions"] = [] self.outputs["rl_rewards"] = [] - # For PPO self.outputs["rl_log_probs"] = [] + def calculate_bids( self, market_config: MarketConfig, diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index e48f44534..94f64bc98 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -62,33 +62,6 @@ def __init__( "trained_policies_load_path", self.trained_policies_save_path ) - # if early_stopping_steps are not provided then set default to no early stopping (early_stopping_steps need to be greater than validation_episodes) - self.early_stopping_steps = learning_config.get( - "early_stopping_steps", - int( - self.training_episodes - / learning_config.get("validation_episodes_interval", 5) - + 1 - ), - ) - self.early_stopping_threshold = learning_config.get( - "early_stopping_threshold", 0.05 - ) - - # if early_stopping_steps are not provided then set default to no early stopping (early_stopping_steps need to be greater than validation_episodes) - self.early_stopping_steps = learning_config.get( - "early_stopping_steps", - int( - self.training_episodes - / learning_config.get("validation_episodes_interval", 5) - + 1 - ), - ) - self.early_stopping_threshold = learning_config.get( - "early_stopping_threshold", 0.05 - ) - - self.learning_rate = learning_config["learning_rate"] self.actor_architecture = learning_config.get(self.rl_algorithm_name, {}).get( @@ -133,6 +106,32 @@ def __init__( 1, ) + # if early_stopping_steps are not provided then set default to no early stopping (early_stopping_steps need to be greater than validation_episodes) + self.early_stopping_steps = learning_config.get( + "early_stopping_steps", + int( + self.training_episodes + / learning_config.get("validation_episodes_interval", 5) + + 1 + ), + ) + self.early_stopping_threshold = learning_config.get( + "early_stopping_threshold", 0.05 + ) + + # if early_stopping_steps are not provided then set default to no early stopping (early_stopping_steps need to be greater than validation_episodes) + self.early_stopping_steps = learning_config.get( + "early_stopping_steps", + int( + self.training_episodes + / learning_config.get("validation_episodes_interval", 5) + + 1 + ), + ) + self.early_stopping_threshold = learning_config.get( + "early_stopping_threshold", 0.05 + ) + self.gradient_steps = ( int(self.train_freq[:-1]) if learning_config.get("gradient_steps", -1) == -1 @@ -209,8 +208,6 @@ def load_inter_episodic_data(self, inter_episodic_data): if self.episodes_done > self.episodes_collecting_initial_experience: self.turn_off_initial_exploration() - self.set_noise_scale(inter_episodic_data["noise_scale"]) - self.initialize_policy(inter_episodic_data["actors_and_critics"]) # TD3 and PPO @@ -323,7 +320,7 @@ def get_noise_scale(self) -> 
None: return stored_scale - def create_learning_algorithm(self, algorithm: RLAlgorithm): + def create_learning_algorithm(self, algorithm: str): """ Create and initialize the reinforcement learning algorithm, based on defined algorithm type. @@ -331,7 +328,7 @@ def create_learning_algorithm(self, algorithm: RLAlgorithm): is associated with the learning role and configured with relevant hyperparameters. Args: - algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. + algorithm (str): The name of the reinforcement learning algorithm. """ if algorithm == "matd3": self.rl_algorithm = TD3( diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index 818162ba8..21bd04052 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -302,7 +302,7 @@ def calculate_bids( unit.outputs["exploration_noise"].at[start] = extra_info # It's noise else: unit.outputs["rl_log_probs"].append(extra_info) # It's log_probs - # unit.outputs["dones"][start] = False + bids = self.remove_empty_bids(bids) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index da01bb37d..e5c6a9961 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -95,7 +95,7 @@ base_ppo: start_date: 2019-03-01 00:00 time_step: 1h end_date: 2019-03-31 00:00 - learning_mode: false + learning_mode: true learning_config: algorithm: ppo continue_learning: false @@ -103,7 +103,7 @@ base_ppo: gradient_steps: 10 learning_rate: 0.0003 max_bid_price: 100 - trained_policies_save_path: "learned_strategies/base_ppo/avg_reward_eval_policies" + trained_policies_save_path: null training_episodes: 100 validation_episodes_interval: 10 matd3: @@ -137,6 +137,11 @@ base_ppo: opening_frequency: 1h opening_duration: 1h volume_unit: MWh + maximum_bid_volume: 100000 + maximum_bid_price: 3000 + minimum_bid_price: -500 + price_unit: EUR/MWh + market_mechanism: pay_as_clear tiny_ppo: