[WIP] Adding MultiAgent Utilities #323

Open

wants to merge 46 commits into base: master

Changes from 7 commits

Commits (46)
1d49049
Single actor critic shared params
hades-rp2010 Sep 1, 2020
ef4a179
Shared layers for multi ACs
hades-rp2010 Sep 1, 2020
2ecd086
Merge branch 'master' of https://github.com/SforAiDl/genrl
hades-rp2010 Sep 1, 2020
53450a8
Fix lint errors (1)
hades-rp2010 Sep 1, 2020
274aff9
Fixed tests
hades-rp2010 Sep 1, 2020
38f95f0
Changes to docstrings and classes
hades-rp2010 Sep 2, 2020
0927001
adding MultiAgentBuffer
AdityaKapoor74 Sep 3, 2020
daa8b2a
shared mlp
AdityaKapoor74 Sep 3, 2020
44db72e
adding changes
AdityaKapoor74 Sep 3, 2020
4ef8f48
new mlp for maddpg
AdityaKapoor74 Sep 3, 2020
d8cf1a9
adding environment loader
AdityaKapoor74 Sep 3, 2020
8d2cf06
Adding Actor and Critic classes
AdityaKapoor74 Sep 3, 2020
1365585
adding new functionalities
AdityaKapoor74 Sep 3, 2020
5067e42
minor changes
AdityaKapoor74 Sep 3, 2020
6f0563e
added return statement to mlp_
AdityaKapoor74 Sep 3, 2020
5061abe
rectifying
AdityaKapoor74 Sep 4, 2020
e6a378c
rectifying 2
AdityaKapoor74 Sep 4, 2020
915d19d
rectifying 3
AdityaKapoor74 Sep 4, 2020
b0b5025
adding test for mlp_concat
AdityaKapoor74 Sep 4, 2020
8cc732b
adding test for mlp_concat
AdityaKapoor74 Sep 4, 2020
b8f7f6a
fixing errors
AdityaKapoor74 Sep 4, 2020
e50e230
adding docstring
AdityaKapoor74 Sep 4, 2020
835819e
Renaming Multi -> Two and comments
hades-rp2010 Sep 4, 2020
793c045
changing names
AdityaKapoor74 Sep 5, 2020
2635fd5
changing names
AdityaKapoor74 Sep 5, 2020
cd87506
Merge branch 'master' of https://github.com/AdityaKapoor74/genrl into…
hades-rp2010 Sep 5, 2020
65b6520
Shared params for single ACs
hades-rp2010 Sep 5, 2020
3d01b85
Merge branch 'multiagentutils' into shared
hades-rp2010 Sep 5, 2020
a62c100
Merge pull request #1 from hades-rp2010/shared
AdityaKapoor74 Oct 4, 2020
841ff66
Merge branch 'master' into multiagentutils
AdityaKapoor74 Oct 4, 2020
2be8df5
rollout buffer for MA
AdityaKapoor74 Oct 4, 2020
ac9b5a8
Merge branch 'multiagentutils' of https://github.com/AdityaKapoor74/g…
AdityaKapoor74 Oct 4, 2020
10282f0
Update genrl/utils/utils.py
AdityaKapoor74 Oct 4, 2020
79b531b
Update genrl/agents/deep/ppo1/ppo1.py
AdityaKapoor74 Oct 4, 2020
a3885a0
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
e3dc677
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
4c2ad51
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
43554e4
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
6828e93
removing SharedAC class
AdityaKapoor74 Oct 4, 2020
eac920c
removing SharedAC class
AdityaKapoor74 Oct 4, 2020
194065f
rectify
AdityaKapoor74 Oct 4, 2020
c0198bc
rectify
AdityaKapoor74 Oct 4, 2020
fe40835
rectify
AdityaKapoor74 Oct 4, 2020
a50204a
rectifying
AdityaKapoor74 Oct 4, 2020
4a3cd74
removing unnecessary code
AdityaKapoor74 Oct 4, 2020
602a7b5
removing unnecessary code
AdityaKapoor74 Oct 4, 2020
160 changes: 159 additions & 1 deletion genrl/core/actor_critic.py
@@ -8,7 +8,7 @@
from genrl.core.base import BaseActorCritic
from genrl.core.policies import MlpPolicy
from genrl.core.values import MlpValue
from genrl.utils.utils import cnn
from genrl.utils.utils import cnn, shared_mlp


class MlpActorCritic(BaseActorCritic):
@@ -216,10 +216,168 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor:
return value


class SharedActorCritic(BaseActorCritic):
def __init__(
self,
critic_prev,
actor_prev,
shared,
critic_post,
actor_post,
weight_init,
activation_func
):
super(SharedActorCritic, self).__init__()

        self.critic, self.actor = shared_mlp(
            critic_prev, actor_prev, shared, critic_post, actor_post, weight_init, activation_func
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state_critic, state_action):
        if state_critic is not None:
            return self.critic(state_critic)

        if state_action is not None:
            return self.actor(state_action)



    def get_action(self, state, one_hot=False, deterministic=False):
        # state = torch.FloatTensor(state).to(self.device)
        logits = self.forward(None, state)
        if one_hot:
            if deterministic:
                # eps=0.0 -> purely greedy one-hot actions
                logits = self.onehot_from_logits(logits, eps=0.0)
            else:
                # eps=1.0 -> uniformly random one-hot actions
                logits = self.onehot_from_logits(logits, eps=1.0)
            return logits

        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        if deterministic:
            index = torch.argmax(dist).item()
        else:
            index = probs.sample().cpu().detach().item()
        return index

def onehot_from_logits(self, logits, eps=0.0):
# get best (according to current policy) actions in one-hot form
argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float()
if eps == 0.0:
return argmax_acs
# get random actions in one-hot form
rand_acs = torch.eye(logits.shape[1])[
[np.random.choice(range(logits.shape[1]), size=logits.shape[0])]
]
# chooses between best and random actions using epsilon greedy
return torch.stack(
[
argmax_acs[i] if r > eps else rand_acs[i]
for i, r in enumerate(torch.rand(logits.shape[0]))
]
)

def get_value(self, state):
# state = torch.FloatTensor(state).to(self.device)
        value = self.forward(state, None)
return value
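
For reviewers following the one-hot path: `onehot_from_logits` is epsilon-greedy selection over one-hot actions — keep the greedy (argmax) row with probability 1 - eps, otherwise substitute a uniformly random one-hot row. Below is a standalone sketch of the same idea, not part of the PR; it assumes batched logits of shape (batch, n_actions) and takes the argmax per row (the method above takes the max over dim 0 instead):

```python
import numpy as np
import torch

def onehot_epsilon_greedy(logits: torch.Tensor, eps: float = 0.0) -> torch.Tensor:
    """Illustrative helper, not part of this PR: epsilon-greedy one-hot actions."""
    # greedy one-hot rows: 1 at each row's argmax, 0 elsewhere
    argmax_acs = (logits == logits.max(1, keepdim=True)[0]).float()
    if eps == 0.0:
        return argmax_acs
    # uniformly random one-hot rows
    rand_acs = torch.eye(logits.shape[1])[
        np.random.choice(range(logits.shape[1]), size=logits.shape[0])
    ]
    # keep the greedy row with probability 1 - eps, otherwise use the random row
    return torch.stack(
        [argmax_acs[i] if r > eps else rand_acs[i] for i, r in enumerate(torch.rand(logits.shape[0]))]
    )

logits = torch.tensor([[0.1, 2.0, -1.0], [3.0, 0.0, 0.5]])
print(onehot_epsilon_greedy(logits, eps=0.0))  # always [[0, 1, 0], [1, 0, 0]]
print(onehot_epsilon_greedy(logits, eps=0.3))  # each row replaced by a random one-hot ~30% of the time
```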



class Actor(BaseActorCritic):
    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        policy_layers: Tuple = (32, 32),
        discrete: bool = True,
        **kwargs,
    ):
        super(Actor, self).__init__()

        self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, policy):
        policy = self.actor(policy)
        return policy



    def get_action(self, state, one_hot=False, deterministic=False):
        # state = torch.FloatTensor(state).to(self.device)
        logits = self.forward(state)
        if one_hot:
            if deterministic:
                # eps=0.0 -> purely greedy one-hot actions
                logits = self.onehot_from_logits(logits, eps=0.0)
            else:
                # eps=1.0 -> uniformly random one-hot actions
                logits = self.onehot_from_logits(logits, eps=1.0)
            return logits

        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        if deterministic:
            index = torch.argmax(dist).item()
        else:
            index = probs.sample().cpu().detach().item()
        return index

def onehot_from_logits(self, logits, eps=0.0):
# get best (according to current policy) actions in one-hot form
argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float()
if eps == 0.0:
return argmax_acs
# get random actions in one-hot form
rand_acs = torch.eye(logits.shape[1])[
[np.random.choice(range(logits.shape[1]), size=logits.shape[0])]
]
# chooses between best and random actions using epsilon greedy
return torch.stack(
[
argmax_acs[i] if r > eps else rand_acs[i]
for i, r in enumerate(torch.rand(logits.shape[0]))
]
)


class Critic(BaseActorCritic):
def __init__(
self,
state_dim: spaces.Space,
action_dim: spaces.Space,
policy_layers: Tuple = (32, 32),
value_layers: Tuple = (32, 32),
val_type: str = "V",
discrete: bool = True,
**kwargs,
):
        super(Critic, self).__init__()

self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs)

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, value):
        value = self.critic(value)
        return value

    def get_value(self, state):
        # state = torch.FloatTensor(state).to(self.device)
        value = self.forward(state)
        return value


actor_critic_registry = {
"mlp": MlpActorCritic,
"cnn": CNNActorCritic,
"mlp12": MlpSingleActorMultiCritic,
"mlpshared": SharedActorCritic,
}
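
The registry is how the rest of genrl maps a network-type string to a class, so the new `"mlpshared"` key would presumably be resolvable by the same helpers that handle `"mlp"`, `"cnn"`, and `"mlp12"` (not shown in this diff). A minimal lookup sketch:

```python
# Minimal sketch: resolve the new "mlpshared" key to its class. The exact
# constructor arguments of SharedActorCritic (the layer lists expected by
# shared_mlp, plus weight_init and activation_func) are not documented in this
# diff, so construction is left out here.
ac_cls = actor_critic_registry["mlpshared"]
assert ac_cls is SharedActorCritic
print(ac_cls.__name__)  # "SharedActorCritic"
```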


87 changes: 87 additions & 0 deletions genrl/core/buffers.py
@@ -23,6 +23,13 @@ class PrioritizedReplayBufferSamples(NamedTuple):
indices: torch.Tensor
weights: torch.Tensor

class MultiAgentReplayBufferSamples(NamedTuple):
states: torch.Tensor
actions: torch.Tensor
rewards: torch.Tensor
next_states: torch.Tensor
dones: torch.Tensor


class ReplayBuffer:
"""
@@ -181,3 +188,83 @@ def __len__(self) -> int:
@property
def pos(self):
return len(self.buffer)



class MultiAgentReplayBuffer:
"""
Implements the basic Experience Replay Mechanism for MultiAgents by feeding in global states,
global actions, global rewards, global next_states, global dones

:param capacity: Size of the replay buffer
:type capacity: int
:param num_agents: Number of agents in the environment
:type num_agents: int
"""
def __init__(self, num_agents, capacity):
self.capacity = capacity
self.num_agents = num_agents
self.buffer = deque(maxlen=max_size)

def push(self, inp: Tuple) -> None:
"""
Adds new experience to buffer

:param inp: (Tuple containing `state`, `action`, `reward`,
`next_state` and `done`)
:type inp: tuple
:returns: None
"""
self.buffer.append(inp)


    def sample(self, batch_size):
        """
        Returns randomly sampled experiences from replay memory

        :param batch_size: Number of samples per batch
        :type batch_size: int
        :returns: (Tuple composed of `indiv_obs_batch`, `indiv_action_batch`,
            `indiv_reward_batch`, `indiv_next_obs_batch`, `global_state_batch`,
            `global_actions_batch`, `global_next_state_batch`, `done_batch`)
        """
        # [[states of agent 1], ..., [states of agent n]]
        indiv_obs_batch = [[] for _ in range(self.num_agents)]
        # [[actions of agent 1], ..., [actions of agent n]]
        indiv_action_batch = [[] for _ in range(self.num_agents)]
        indiv_reward_batch = [[] for _ in range(self.num_agents)]
        indiv_next_obs_batch = [[] for _ in range(self.num_agents)]

        global_state_batch = []
        global_next_state_batch = []
        global_actions_batch = []
        done_batch = []

        batch = random.sample(self.buffer, batch_size)

        for experience in batch:
            state, action, reward, next_state, done = experience

            for i in range(self.num_agents):
                indiv_obs_batch[i].append(state[i])
                indiv_action_batch[i].append(action[i])
                indiv_reward_batch[i].append(reward[i])
                indiv_next_obs_batch[i].append(next_state[i])

            global_state_batch.append(torch.cat(state))
            global_actions_batch.append(torch.cat(action))
            global_next_state_batch.append(torch.cat(next_state))
            done_batch.append(done)

        global_state_batch = torch.stack(global_state_batch)
        global_actions_batch = torch.stack(global_actions_batch)
        global_next_state_batch = torch.stack(global_next_state_batch)
        done_batch = torch.stack(done_batch)
        # Per-agent observations and actions are stored as tensors (they are concatenated
        # with torch.cat above), so stack them; rewards are stored as scalars per agent
        indiv_obs_batch = torch.stack([torch.stack(obs) for obs in indiv_obs_batch])
        indiv_action_batch = torch.stack([torch.stack(act) for act in indiv_action_batch])
        indiv_reward_batch = torch.stack([torch.FloatTensor(rew) for rew in indiv_reward_batch])
        indiv_next_obs_batch = torch.stack(
            [torch.stack(next_obs) for next_obs in indiv_next_obs_batch]
        )

        return (
            indiv_obs_batch,
            indiv_action_batch,
            indiv_reward_batch,
            indiv_next_obs_batch,
            global_state_batch,
            global_actions_batch,
            global_next_state_batch,
            done_batch,
        )

def __len__(self):
return len(self.buffer)
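
A minimal usage sketch for the buffer, assuming per-agent observations and actions are stored as tensors, rewards as scalars, and dones as a per-agent tensor — these shapes are illustration-only assumptions, not fixed by the PR:

```python
import torch

# 2 agents, 4-dim observations, 2-dim actions (all hypothetical sizes)
num_agents, obs_dim, act_dim = 2, 4, 2
buffer = MultiAgentReplayBuffer(num_agents=num_agents, capacity=100)

for _ in range(16):
    state = [torch.randn(obs_dim) for _ in range(num_agents)]     # per-agent observations
    action = [torch.randn(act_dim) for _ in range(num_agents)]    # per-agent actions
    reward = [0.0 for _ in range(num_agents)]                     # per-agent scalar rewards
    next_state = [torch.randn(obs_dim) for _ in range(num_agents)]
    done = torch.zeros(num_agents)                                # per-agent done flags
    buffer.push((state, action, reward, next_state, done))

(
    indiv_obs,
    indiv_act,
    indiv_rew,
    indiv_next_obs,
    global_state,
    global_actions,
    global_next_state,
    dones,
) = buffer.sample(batch_size=8)

print(indiv_obs.shape)     # torch.Size([2, 8, 4])  -> (num_agents, batch_size, obs_dim)
print(global_state.shape)  # torch.Size([8, 8])     -> (batch_size, num_agents * obs_dim)
```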
104 changes: 104 additions & 0 deletions genrl/environments/gym_wrapper.py
@@ -106,3 +106,107 @@ def close(self) -> None:
Closes environment
"""
self.env.close()


class MultiGymWrapper(gym.Wrapper):
Review comment (Member):
What's the difference between this and a normal Wrapper? Can't you use a TorchWrapper instead of creating this?

Reply (Contributor, author):
This is just a template, I haven't added anything yet.

"""
Wrapper class for all MultiAgent Particle Environments

:param env: Gym environment name
:param n_envs: Number of environments. None if not vectorised
:param parallel: If vectorised, should environments be run through \
serially or parallelly
:type env: string
:type n_envs: None, int
:type parallel: boolean
"""

def __init__(self, env: gym.Env):
super(GymWrapper, self).__init__(env)
self.env = env

self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

self.state = None
self.action = None
self.reward = None
self.done = False
self.info = {}

def __getattr__(self, name: str) -> Any:
"""
All other calls would go to base env
"""
        env = super(MultiGymWrapper, self).__getattribute__("env")
return getattr(env, name)

@property
def obs_shape(self):
if isinstance(self.env.observation_space, gym.spaces.Discrete):
obs_shape = (1,)
elif isinstance(self.env.observation_space, gym.spaces.Box):
obs_shape = self.env.observation_space.shape
return obs_shape

@property
def action_shape(self):
if isinstance(self.env.action_space, gym.spaces.Box):
action_shape = self.env.action_space.shape
elif isinstance(self.env.action_space, gym.spaces.Discrete):
action_shape = (1,)
return action_shape

def sample(self) -> np.ndarray:
"""
Shortcut method to directly sample from environment's action space

:returns: Random action from action space
:rtype: NumPy Array
"""
return self.env.action_space.sample()

def render(self, mode: str = "human") -> None:
"""
Renders all envs in a tiles format similar to baselines.

:param mode: Can either be 'human' or 'rgb_array'. \
Displays tiled images in 'human' and returns tiled images in 'rgb_array'
:type mode: string
"""
self.env.render(mode=mode)

def seed(self, seed: int = None) -> None:
"""
Set environment seed

:param seed: Value of seed
:type seed: int
"""
self.env.seed(seed)

def step(self, action: np.ndarray) -> np.ndarray:
"""
Steps the env through given action

:param action: Action taken by agent
:type action: NumPy array
:returns: Next observation, reward, game status and debugging info
"""
self.state, self.reward, self.done, self.info = self.env.step(action)
self.action = action
return self.state, self.reward, self.done, self.info

def reset(self) -> np.ndarray:
"""
Resets environment

:returns: Initial state
"""
return self.env.reset()

def close(self) -> None:
"""
Closes environment
"""
self.env.close()
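
Since the wrapper currently only delegates to the wrapped environment, any `gym.Env` can stand in for a multi-agent particle environment in a quick sketch; the actual MPE instance you would pass is not part of this diff:

```python
import gym

# Illustrative only: CartPole stands in for an MPE environment here because the
# wrapper simply forwards every call to the underlying env.
env = MultiGymWrapper(gym.make("CartPole-v1"))
env.seed(0)

state = env.reset()
for _ in range(5):
    action = env.sample()  # random action from the wrapped env's action space
    state, reward, done, info = env.step(action)
    if done:
        state = env.reset()
env.close()
```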