From 1d49049a67f14b79b422dd9521f32be864f419eb Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Tue, 1 Sep 2020 15:49:49 +0530 Subject: [PATCH 01/39] Single actor critic shared params --- genrl/agents/deep/a2c/a2c.py | 30 ++++++++-- genrl/agents/deep/base/base.py | 2 + genrl/agents/deep/base/offpolicy.py | 3 +- genrl/agents/deep/ddpg/ddpg.py | 26 ++++++++- genrl/agents/deep/ppo1/ppo1.py | 29 +++++++++- genrl/core/actor_critic.py | 70 +++++++++++++++++++++++- tests/test_deep/test_agents/test_a2c.py | 8 +++ tests/test_deep/test_agents/test_ddpg.py | 26 +++++++++ tests/test_deep/test_agents/test_ppo1.py | 8 +++ 9 files changed, 188 insertions(+), 14 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 595b9f14..e990f531 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -66,7 +66,24 @@ def _create_model(self) -> None: state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + shared_layers=self.shared_layers, + policy_layers=self.policy_layers, + value_layers=self.value_layers, + val_type="V", + discrete=discrete, + action_lim=action_lim, + ).to(self.device) + actor_params = list(self.ac.shared.parameters()) + list( + self.ac.actor.parameters() + ) + critic_params = list(self.ac.shared.parameters()) + list( + self.ac.critic.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, @@ -76,18 +93,21 @@ def _create_model(self) -> None: discrete=discrete, action_lim=action_lim, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() + else: self.ac = self.network.to(self.device) - - # action_dim = self.network.action_dim + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() if self.noise is not None: self.noise = self.noise( np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/agents/deep/base/base.py b/genrl/agents/deep/base/base.py index f37907a9..c2067c91 100644 --- a/genrl/agents/deep/base/base.py +++ b/genrl/agents/deep/base/base.py @@ -34,6 +34,7 @@ def __init__( create_model: bool = True, batch_size: int = 64, gamma: float = 0.99, + shared_layers=None, policy_layers: Tuple = (64, 64), value_layers: Tuple = (64, 64), lr_policy: float = 0.0001, @@ -45,6 +46,7 @@ def __init__( self.create_model = create_model self.batch_size = batch_size self.gamma = gamma + self.shared_layers = shared_layers self.policy_layers = policy_layers self.value_layers = value_layers self.lr_policy = lr_policy diff --git a/genrl/agents/deep/base/offpolicy.py b/genrl/agents/deep/base/offpolicy.py index 916a60ec..f64f3dff 100644 --- a/genrl/agents/deep/base/offpolicy.py +++ b/genrl/agents/deep/base/offpolicy.py @@ -174,7 +174,7 @@ def select_action( # add noise to output from policy network if self.noise is not None: - action += self.noise() + action = action + 
self.noise() return np.clip( action, self.env.action_space.low[0], self.env.action_space.high[0] @@ -233,7 +233,6 @@ def get_target_q_values( next_q_target_values = self.ac_target.get_value( torch.cat([next_states, next_target_actions], dim=-1) ) - target_q_values = rewards + self.gamma * (1 - dones) * next_q_target_values return target_q_values diff --git a/genrl/agents/deep/ddpg/ddpg.py b/genrl/agents/deep/ddpg/ddpg.py index 24f004b6..9adfc27a 100644 --- a/genrl/agents/deep/ddpg/ddpg.py +++ b/genrl/agents/deep/ddpg/ddpg.py @@ -62,7 +62,23 @@ def _create_model(self) -> None: np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + self.shared_layers, + self.policy_layers, + self.value_layers, + "Qsa", + False, + ).to(self.device) + actor_params = list(self.ac.actor.parameters()) + list( + self.ac.shared.parameters() + ) + critic_params = list(self.ac.critic.parameters()) + list( + self.ac.shared.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, @@ -71,13 +87,17 @@ def _create_model(self) -> None: "Qsa", False, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() else: self.ac = self.network + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() self.ac_target = deepcopy(self.ac).to(self.device) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def update_params(self, update_interval: int) -> None: """Update parameters of the model diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 0987d078..456aa7d1 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -66,10 +66,29 @@ def _create_model(self): state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + shared_layers=self.shared_layers, + policy_layers=self.policy_layers, + value_layers=self.value_layers, + val_typ="V", + discrete=discrete, + action_lim=action_lim, + activation=self.activation, + ).to(self.device) + actor_params = list(self.ac.shared.parameters()) + list( + self.ac.actor.parameters() + ) + critic_params = list(self.ac.shared.parameters()) + list( + self.ac.critic.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, + shared_layers=self.shared_layers, policy_layers=self.policy_layers, value_layers=self.value_layers, val_typ="V", @@ -77,11 +96,15 @@ def _create_model(self): action_lim=action_lim, activation=self.activation, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), 
lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 6214ec46..1ce61f72 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -9,7 +9,7 @@ from genrl.core.base import BaseActorCritic from genrl.core.policies import MlpPolicy from genrl.core.values import MlpValue -from genrl.utils.utils import cnn +from genrl.utils.utils import cnn, mlp class MlpActorCritic(BaseActorCritic): @@ -41,6 +41,73 @@ def __init__( self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) +class MlpSharedActorCritic(BaseActorCritic): + """MLP Shared Actor Critic + + Attributes: + state_dim (int): State dimensions of the environment + action_dim (int): Action space dimensions of the environment + hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + val_type (str): Value type of the critic network + discrete (bool): True if the action space is discrete, else False + sac (bool): True if a SAC-like network is needed, else False + activation (str): Activation function to be used. Can be either "tanh" or "relu" + """ + + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + shared_layers: Tuple = (32, 32), + policy_layers: Tuple = (32, 32), + value_layers: Tuple = (32, 32), + val_type: str = "V", + discrete: bool = True, + **kwargs, + ): + super(MlpSharedActorCritic, self).__init__() + self.shared = mlp([state_dim] + list(shared_layers)) + self.actor = MlpPolicy( + shared_layers[-1], action_dim, policy_layers, discrete, **kwargs + ) + self.critic = MlpValue( + shared_layers[-1], action_dim, val_type, value_layers, **kwargs + ) + self.state_dim = state_dim + self.action_dim = action_dim + + def get_features(self, state: torch.Tensor): + features = self.shared(state) + return features + + def get_action(self, state: torch.Tensor, deterministic: bool = False): + state = torch.as_tensor(state).float() + features = self.get_features(state) + action_probs = self.actor(features) + action_probs = nn.Softmax(dim=-1)(action_probs) + + if deterministic: + action = torch.argmax(action_probs, dim=-1).unsqueeze(-1).float() + distribution = None + else: + distribution = Categorical(probs=action_probs) + action = distribution.sample() + + return action, distribution + + def get_value(self, state: torch.Tensor): + state = torch.as_tensor(state).float() + if self.critic.val_type == "Qsa": + features = self.shared(state[:, :, :-1]) + features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) + print(f"features {features.shape}") + value = self.critic(features).float().squeeze(-1) + else: + features = self.shared(state) + value = self.critic(features) + return value + + class MlpSingleActorMultiCritic(BaseActorCritic): """MLP Actor Critic @@ -220,6 +287,7 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: "mlp": MlpActorCritic, "cnn": CNNActorCritic, "mlp12": MlpSingleActorMultiCritic, + "mlps": MlpSharedActorCritic, } diff --git a/tests/test_deep/test_agents/test_a2c.py b/tests/test_deep/test_agents/test_a2c.py index 2b012069..f731f40f 100644 --- a/tests/test_deep/test_agents/test_a2c.py +++ b/tests/test_deep/test_agents/test_a2c.py @@ -19,3 +19,11 @@ def test_a2c_cnn(): trainer = OnPolicyTrainer(algo, env, 
log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_a2c_shared(): + env = VectorEnv("CartPole-v0", 1) + algo = A2C("mlp", env, shared_layers=(32, 32), rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ddpg.py b/tests/test_deep/test_agents/test_ddpg.py index ab309518..94670cef 100644 --- a/tests/test_deep/test_agents/test_ddpg.py +++ b/tests/test_deep/test_agents/test_ddpg.py @@ -29,3 +29,29 @@ def test_ddpg(): ) trainer.train() shutil.rmtree("./logs") + + +def test_ddpg_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = DDPG( + "mlp", + env, + batch_size=5, + noise=NormalActionNoise, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=4, + max_ep_len=200, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 3e9feaf2..1bb06a22 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,3 +19,11 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_ppo1_shared(): + env = VectorEnv("CartPole-v0") + algo = PPO1("mlp", env, shared_layers=(32, 32), rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") From ef4a179a321ed5a4f306067a77712948d7b2e93b Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 01:17:47 +0530 Subject: [PATCH 02/39] Shared layers for multi ACs --- genrl/agents/deep/a2c/a2c.py | 32 ++--- genrl/agents/deep/base/offpolicy.py | 2 +- genrl/agents/deep/ddpg/ddpg.py | 27 +--- genrl/agents/deep/ppo1/ppo1.py | 30 +--- genrl/agents/deep/sac/sac.py | 16 +-- genrl/agents/deep/td3/td3.py | 18 ++- genrl/core/actor_critic.py | 160 ++++++++++++++++++++- tests/test_deep/test_agents/test_custom.py | 7 + tests/test_deep/test_agents/test_ppo1.py | 8 -- tests/test_deep/test_agents/test_sac.py | 25 ++++ tests/test_deep/test_agents/test_td3.py | 26 ++++ 11 files changed, 253 insertions(+), 98 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index e990f531..86cd84ad 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -66,8 +66,11 @@ def _create_model(self) -> None: state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch_type = self.network + if self.shared_layers is not None: + arch_type += "s" + self.ac = get_model("ac", arch_type)( state_dim, action_dim, shared_layers=self.shared_layers, @@ -77,37 +80,18 @@ def _create_model(self) -> None: discrete=discrete, action_lim=action_lim, ).to(self.device) - actor_params = list(self.ac.shared.parameters()) + list( - self.ac.actor.parameters() - ) - critic_params = list(self.ac.shared.parameters()) + list( - self.ac.critic.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - 
policy_layers=self.policy_layers, - value_layers=self.value_layers, - val_type="V", - discrete=discrete, - action_lim=action_lim, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() if self.noise is not None: self.noise = self.noise( np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) - self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) + actor_params, critic_params = self.ac.get_params() + self.optimizer_policy = opt.Adam(critic_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(actor_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/agents/deep/base/offpolicy.py b/genrl/agents/deep/base/offpolicy.py index f64f3dff..656d7911 100644 --- a/genrl/agents/deep/base/offpolicy.py +++ b/genrl/agents/deep/base/offpolicy.py @@ -174,7 +174,7 @@ def select_action( # add noise to output from policy network if self.noise is not None: - action = action + self.noise() + action += self.noise() return np.clip( action, self.env.action_space.low[0], self.env.action_space.high[0] diff --git a/genrl/agents/deep/ddpg/ddpg.py b/genrl/agents/deep/ddpg/ddpg.py index 9adfc27a..0d09314b 100644 --- a/genrl/agents/deep/ddpg/ddpg.py +++ b/genrl/agents/deep/ddpg/ddpg.py @@ -62,8 +62,11 @@ def _create_model(self) -> None: np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch_type = self.network + if self.shared_layers is not None: + arch_type += "s" + self.ac = get_model("ac", arch_type)( state_dim, action_dim, self.shared_layers, @@ -72,28 +75,10 @@ def _create_model(self) -> None: "Qsa", False, ).to(self.device) - actor_params = list(self.ac.actor.parameters()) + list( - self.ac.shared.parameters() - ) - critic_params = list(self.ac.critic.parameters()) + list( - self.ac.shared.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - self.policy_layers, - self.value_layers, - "Qsa", - False, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() + actor_params, critic_params = self.ac.get_params() self.ac_target = deepcopy(self.ac).to(self.device) self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 456aa7d1..7359e621 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -66,8 +66,11 @@ def _create_model(self): state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch = self.network + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, shared_layers=self.shared_layers, @@ -78,31 +81,10 @@ def _create_model(self): action_lim=action_lim, 
activation=self.activation, ).to(self.device) - actor_params = list(self.ac.shared.parameters()) + list( - self.ac.actor.parameters() - ) - critic_params = list(self.ac.shared.parameters()) + list( - self.ac.critic.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - shared_layers=self.shared_layers, - policy_layers=self.policy_layers, - value_layers=self.value_layers, - val_typ="V", - discrete=discrete, - action_lim=action_lim, - activation=self.activation, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() + actor_params, critic_params = self.ac.get_params() self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) diff --git a/genrl/agents/deep/sac/sac.py b/genrl/agents/deep/sac/sac.py index b7a5572d..54c7f87b 100644 --- a/genrl/agents/deep/sac/sac.py +++ b/genrl/agents/deep/sac/sac.py @@ -76,8 +76,10 @@ def _create_model(self, **kwargs) -> None: state_dim, action_dim, discrete, _ = get_env_properties( self.env, self.network ) - - self.ac = get_model("ac", self.network + "12")( + arch = self.network + "12" + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, policy_layers=self.policy_layers, @@ -92,13 +94,9 @@ def _create_model(self, **kwargs) -> None: self.model = self.network self.ac_target = deepcopy(self.ac) - - self.critic_params = list(self.ac.critic1.parameters()) + list( - self.ac.critic2.parameters() - ) - - self.optimizer_value = opt.Adam(self.critic_params, self.lr_value) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), self.lr_policy) + actor_params, critic_params = self.ac.get_params() + self.optimizer_value = opt.Adam(critic_params, self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, self.lr_policy) if self.entropy_tuning: self.target_entropy = -torch.prod( diff --git a/genrl/agents/deep/td3/td3.py b/genrl/agents/deep/td3/td3.py index a9687446..5a8e83d2 100644 --- a/genrl/agents/deep/td3/td3.py +++ b/genrl/agents/deep/td3/td3.py @@ -68,10 +68,13 @@ def _create_model(self) -> None: ) if isinstance(self.network, str): - # Below, the "12" corresponds to the Single Actor, Double Critic network architecture - self.ac = get_model("ac", self.network + "12")( + arch = self.network + "12" + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, + shared_layers=self.shared_layers, policy_layers=self.policy_layers, value_layers=self.value_layers, val_type="Qsa", @@ -86,14 +89,9 @@ def _create_model(self) -> None: ) self.ac_target = deepcopy(self.ac) - - self.critic_params = list(self.ac.critic1.parameters()) + list( - self.ac.critic2.parameters() - ) - self.optimizer_value = torch.optim.Adam(self.critic_params, lr=self.lr_value) - self.optimizer_policy = torch.optim.Adam( - self.ac.actor.parameters(), lr=self.lr_policy - ) + actor_params, critic_params = self.ac.get_params() + self.optimizer_value = torch.optim.Adam(critic_params, lr=self.lr_value) + self.optimizer_policy = torch.optim.Adam(actor_params, lr=self.lr_policy) def update_params(self, update_interval: int) -> None: """Update parameters of the model diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 
1ce61f72..1b135ebd 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -29,6 +29,7 @@ def __init__( self, state_dim: spaces.Space, action_dim: spaces.Space, + shared_layers: None, policy_layers: Tuple = (32, 32), value_layers: Tuple = (32, 32), val_type: str = "V", @@ -40,6 +41,11 @@ def __init__( self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs) self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) + def get_params(self): + actor_params = self.actor.parameters() + critic_params = self.critic.parameters() + return actor_params, critic_params + class MlpSharedActorCritic(BaseActorCritic): """MLP Shared Actor Critic @@ -76,7 +82,20 @@ def __init__( self.state_dim = state_dim self.action_dim = action_dim + def get_params(self): + actor_params = list(self.shared.parameters()) + list(self.actor.parameters()) + critic_params = list(self.shared.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_features(self, state: torch.Tensor): + """Extract features from the state, which is then an input to get_action and get_value + + Args: + state (:obj:`torch.Tensor`): The state(s) being passed + + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ features = self.shared(state) return features @@ -100,7 +119,6 @@ def get_value(self, state: torch.Tensor): if self.critic.val_type == "Qsa": features = self.shared(state[:, :, :-1]) features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) - print(f"features {features.shape}") value = self.critic(features).float().squeeze(-1) else: features = self.shared(state) @@ -144,6 +162,137 @@ def __init__( self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 + def get_params(self): + actor_params = self.actor.parameters() + critic_params = list(self.critic1.parameters()) + list( + self.critic2.parameters() + ) + return actor_params, critic_params + + def forward(self, x): + q1_values = self.critic1(x).squeeze(-1) + q2_values = self.critic2(x).squeeze(-1) + return (q1_values, q2_values) + + def get_action(self, state: torch.Tensor, deterministic: bool = False): + state = torch.as_tensor(state).float() + + if self.actor.sac: + mean, log_std = self.actor(state) + std = log_std.exp() + distribution = Normal(mean, std) + + action_probs = distribution.rsample() + log_probs = distribution.log_prob(action_probs) + action_probs = torch.tanh(action_probs) + + action = action_probs * self.action_scale + self.action_bias + + # enforcing action bound (appendix of SAC paper) + log_probs -= torch.log( + self.action_scale * (1 - action_probs.pow(2)) + np.finfo(np.float32).eps + ) + log_probs = log_probs.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + + action = (action.float(), log_probs, mean) + else: + action = self.actor.get_action(state, deterministic=deterministic) + + return action + + def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: + """Get Values from the Critic + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + mode (str): What values should be returned. 
Types: + "both" --> Both values will be returned + "min" --> The minimum of both values will be returned + "first" --> The value from the first critic only will be returned + + Returns: + values (:obj:`list`): List of values as estimated by each individual critic + """ + state = torch.as_tensor(state).float() + + if mode == "both": + values = self.forward(state) + elif mode == "min": + values = self.forward(state) + values = torch.min(*values).squeeze(-1) + elif mode == "first": + values = self.critic1(state) + else: + raise KeyError("Mode doesn't exist") + + return values + + +class MlpSharedSingleActorMultiCritic(BaseActorCritic): + """MLP Actor Critic + + Attributes: + state_dim (int): State dimensions of the environment + action_dim (int): Action space dimensions of the environment + hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + val_type (str): Value type of the critic network + discrete (bool): True if the action space is discrete, else False + num_critics (int): Number of critics in the architecture + sac (bool): True if a SAC-like network is needed, else False + activation (str): Activation function to be used. Can be either "tanh" or "relu" + """ + + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + shared_layers: Tuple = (32, 32), + policy_layers: Tuple = (32, 32), + value_layers: Tuple = (32, 32), + val_type: str = "V", + discrete: bool = True, + num_critics: int = 2, + **kwargs, + ): + super(MlpSharedSingleActorMultiCritic, self).__init__() + + self.num_critics = num_critics + self.shared = mlp([state_dim] + list(shared_layers)) + self.actor = MlpPolicy( + shared_layers[-1], action_dim, policy_layers, discrete, **kwargs + ) + self.critic1 = MlpValue( + shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + ) + self.critic2 = MlpValue( + shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + ) + + self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 + self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 + + def get_params(self): + actor_params = list(self.actor.parameters()) + list(self.shared.parameters()) + critic_params = ( + list(self.critic1.parameters()) + + list(self.critic2.parameters()) + + list(self.shared.parameters()) + ) + return actor_params, critic_params + + def get_features(self, state: torch.Tensor): + """Extract features from the state, which is then an input to get_action and get_value + + Args: + state (:obj:`torch.Tensor`): The state(s) being passed + + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ + features = self.shared(state) + return features + def forward(self, x): q1_values = self.critic1(x).squeeze(-1) q2_values = self.critic2(x).squeeze(-1) @@ -151,6 +300,7 @@ def forward(self, x): def get_action(self, state: torch.Tensor, deterministic: bool = False): state = torch.as_tensor(state).float() + state = self.get_features(state) if self.actor.sac: mean, log_std = self.actor(state) @@ -190,6 +340,8 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: values (:obj:`list`): List of values as estimated by each individual critic """ state = torch.as_tensor(state).float() + x = self.get_features(state[:, :, :-1]) + state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) if mode == "both": values = self.forward(state) @@ -240,6 +392,11 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) + def get_params(self): + actor_params = 
list(self.feature.parameters()) + list(self.actor.parameters()) + critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: @@ -288,6 +445,7 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: "cnn": CNNActorCritic, "mlp12": MlpSingleActorMultiCritic, "mlps": MlpSharedActorCritic, + "mlp12s": MlpSharedSingleActorMultiCritic, } diff --git a/tests/test_deep/test_agents/test_custom.py b/tests/test_deep/test_agents/test_custom.py index c4614b70..a0c97063 100644 --- a/tests/test_deep/test_agents/test_custom.py +++ b/tests/test_deep/test_agents/test_custom.py @@ -24,6 +24,7 @@ def __init__( self, state_dim, action_dim, + shared_layers=None, policy_layers=(1, 1), value_layers=(1, 1), val_type="V", @@ -32,12 +33,18 @@ def __init__( super(custom_actorcritic, self).__init__( state_dim, action_dim, + shared_layers=shared_layers, policy_layers=policy_layers, value_layers=value_layers, val_type=val_type, **kwargs ) + def get_params(self): + actor_params = self.actor.parameters() + critic_params = self.critic.parameters() + return actor_params, critic_params + def test_custom_vpg(): env = VectorEnv("CartPole-v0", 1) diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 1bb06a22..3e9feaf2 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,11 +19,3 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") - - -def test_ppo1_shared(): - env = VectorEnv("CartPole-v0") - algo = PPO1("mlp", env, shared_layers=(32, 32), rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_sac.py b/tests/test_deep/test_agents/test_sac.py index 8755c5c4..3ea1cfee 100644 --- a/tests/test_deep/test_agents/test_sac.py +++ b/tests/test_deep/test_agents/test_sac.py @@ -21,3 +21,28 @@ def test_sac(): ) trainer.train() shutil.rmtree("./logs") + + +def test_sac_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = SAC( + "mlp", + env, + batch_size=5, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=5, + max_ep_len=500, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_td3.py b/tests/test_deep/test_agents/test_td3.py index e3d59491..35def46f 100644 --- a/tests/test_deep/test_agents/test_td3.py +++ b/tests/test_deep/test_agents/test_td3.py @@ -29,3 +29,29 @@ def test_td3(): ) trainer.train() shutil.rmtree("./logs") + + +def test_td3_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = TD3( + "mlp", + env, + batch_size=5, + noise=OrnsteinUhlenbeckActionNoise, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=5, + max_ep_len=500, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") From 53450a8399530e68ba179cbe59ddeaa8f354a503 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 01:28:36 +0530 Subject: [PATCH 03/39] Fix lint errors (1) --- genrl/core/actor_critic.py | 15 +++++---------- 1 
file changed, 5 insertions(+), 10 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1b135ebd..1660e132 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -90,12 +90,12 @@ def get_params(self): def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value - Args: - state (:obj:`torch.Tensor`): The state(s) being passed + Args: + state (:obj:`torch.Tensor`): The state(s) being passed - Returns: - features (:obj:`torch.Tensor`): The feature(s) extracted from the state - """ + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ features = self.shared(state) return features @@ -392,11 +392,6 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) - def get_params(self): - actor_params = list(self.feature.parameters()) + list(self.actor.parameters()) - critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) - return actor_params, critic_params - def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: From 274aff98cb11915be2a984624f1e9c9bc22b2fa4 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 02:14:03 +0530 Subject: [PATCH 04/39] Fixed tests --- genrl/core/actor_critic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1660e132..b45222de 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -392,6 +392,11 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) + def get_params(self): + actor_params = list(self.feature.parameters()) + list(self.actor.parameters()) + critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: From 38f95f00ee2397844fb174cda61641134adc4a04 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Thu, 3 Sep 2020 01:38:45 +0530 Subject: [PATCH 05/39] Changes to dicstrings and classes --- genrl/core/actor_critic.py | 141 +++++++++++------------ tests/test_deep/test_agents/test_ppo1.py | 8 ++ 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index b45222de..aa460be7 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -18,7 +18,8 @@ class MlpActorCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False sac (bool): True if a SAC-like network is needed, else False @@ -53,7 +54,9 @@ class MlpSharedActorCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): 
Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False sac (bool): True if a SAC-like network is needed, else False @@ -100,6 +103,18 @@ def get_features(self, state: torch.Tensor): return features def get_action(self, state: torch.Tensor, deterministic: bool = False): + """Get Actions from the actor + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False + + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ + state = torch.as_tensor(state).float() features = self.get_features(state) action_probs = self.actor(features) @@ -115,6 +130,14 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): return action, distribution def get_value(self, state: torch.Tensor): + """Get Values from the Critic + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + + Returns: + values (:obj:`list`): List of values as estimated by the critic + """ state = torch.as_tensor(state).float() if self.critic.val_type == "Qsa": features = self.shared(state[:, :, :-1]) @@ -132,7 +155,8 @@ class MlpSingleActorMultiCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False num_critics (int): Number of critics in the architecture @@ -175,6 +199,17 @@ def forward(self, x): return (q1_values, q2_values) def get_action(self, state: torch.Tensor, deterministic: bool = False): + """Get Actions from the actor + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False + + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ state = torch.as_tensor(state).float() if self.actor.sac: @@ -229,13 +264,15 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: return values -class MlpSharedSingleActorMultiCritic(BaseActorCritic): +class MlpSharedSingleActorMultiCritic(MlpSingleActorMultiCritic): """MLP Actor Critic Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False num_critics (int): Number of critics in the architecture @@ -250,36 +287,22 @@ def __init__( shared_layers: Tuple = (32, 32), policy_layers: Tuple = (32, 32), value_layers: Tuple = (32, 32), - val_type: str = "V", + val_type: str = "Qsa", discrete: bool = True, num_critics: int = 
2, **kwargs, ): - super(MlpSharedSingleActorMultiCritic, self).__init__() - - self.num_critics = num_critics - self.shared = mlp([state_dim] + list(shared_layers)) - self.actor = MlpPolicy( - shared_layers[-1], action_dim, policy_layers, discrete, **kwargs - ) - self.critic1 = MlpValue( - shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + super(MlpSharedSingleActorMultiCritic, self).__init__( + shared_layers[-1], + action_dim, + policy_layers, + value_layers, + val_type, + discrete, + num_critics, + **kwargs, ) - self.critic2 = MlpValue( - shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs - ) - - self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 - self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 - - def get_params(self): - actor_params = list(self.actor.parameters()) + list(self.shared.parameters()) - critic_params = ( - list(self.critic1.parameters()) - + list(self.critic2.parameters()) - + list(self.shared.parameters()) - ) - return actor_params, critic_params + self.shared = mlp([state_dim] + list(shared_layers)) def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value @@ -293,41 +316,24 @@ def get_features(self, state: torch.Tensor): features = self.shared(state) return features - def forward(self, x): - q1_values = self.critic1(x).squeeze(-1) - q2_values = self.critic2(x).squeeze(-1) - return (q1_values, q2_values) - def get_action(self, state: torch.Tensor, deterministic: bool = False): - state = torch.as_tensor(state).float() - state = self.get_features(state) + """Get Actions from the actor - if self.actor.sac: - mean, log_std = self.actor(state) - std = log_std.exp() - distribution = Normal(mean, std) - - action_probs = distribution.rsample() - log_probs = distribution.log_prob(action_probs) - action_probs = torch.tanh(action_probs) - - action = action_probs * self.action_scale + self.action_bias - - # enforcing action bound (appendix of SAC paper) - log_probs -= torch.log( - self.action_scale * (1 - action_probs.pow(2)) + np.finfo(np.float32).eps - ) - log_probs = log_probs.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - - action = (action.float(), log_probs, mean) - else: - action = self.actor.get_action(state, deterministic=deterministic) + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False - return action + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ + return super(MlpSharedSingleActorMultiCritic, self).get_action( + self.get_features(state), deterministic=deterministic + ) - def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: - """Get Values from the Critic + def get_value(self, state: torch.Tensor, mode="first"): + """Get Values from both the Critic Arg: state (:obj:`torch.Tensor`): The state(s) being passed to the critics @@ -342,18 +348,7 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: state = torch.as_tensor(state).float() x = self.get_features(state[:, :, :-1]) state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) - - if mode == "both": - values = self.forward(state) - elif mode == "min": - values = self.forward(state) - values = torch.min(*values).squeeze(-1) - elif mode == "first": - values = 
self.critic1(state) - else: - raise KeyError("Mode doesn't exist") - - return values + return super(MlpSharedSingleActorMultiCritic, self).get_value(state, mode) class CNNActorCritic(BaseActorCritic): diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 3e9feaf2..97d40791 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,3 +19,11 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_ppo1_shared(): + env = VectorEnv("CartPole-v0") + algo = PPO1("mlp", env, shared_layers=[32, 32], rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") From 0927001fa98545de813be9cb6610bc557476cf86 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 11:07:09 +0530 Subject: [PATCH 06/39] adding MultiAgentBuffer --- genrl/core/buffers.py | 87 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/genrl/core/buffers.py b/genrl/core/buffers.py index 0a5b6e7c..dcfe5f89 100644 --- a/genrl/core/buffers.py +++ b/genrl/core/buffers.py @@ -23,6 +23,13 @@ class PrioritizedReplayBufferSamples(NamedTuple): indices: torch.Tensor weights: torch.Tensor +class MultiAgentReplayBuffer(NamedTuple): + states: torch.Tensor + actions: torch.Tensor + rewards: torch.Tensor + next_states: torch.Tensor + dones: torch.Tensor + class ReplayBuffer: """ @@ -181,3 +188,83 @@ def __len__(self) -> int: @property def pos(self): return len(self.buffer) + + + +class MultiAgentReplayBuffer: + """ + Implements the basic Experience Replay Mechanism for MultiAgents by feeding in global states, + global actions, global rewards, global next_states, global dones + + :param capacity: Size of the replay buffer + :type capacity: int + :param num_agents: Number of agents in the environment + :type num_agents: int + """ + def __init__(self, num_agents, capacity): + self.capacity = capacity + self.num_agents = num_agents + self.buffer = deque(maxlen=max_size) + + def push(self, inp: Tuple) -> None: + """ + Adds new experience to buffer + + :param inp: (Tuple containing `state`, `action`, `reward`, + `next_state` and `done`) + :type inp: tuple + :returns: None + """ + self.buffer.append(inp) + + + def sample(self, batch_size): + + """ + Returns randomly sampled experiences from replay memory + + :param batch_size: Number of samples per batch + :type batch_size: int + :returns: (Tuple composing of `indiv_obs_batch`, `indiv_action_batch`, `indiv_reward_batch`, `indiv_next_obs_batch`, + `global_state_batch`, `global_actions_batch`, `global_next_state_batch`, `done_batch`) + """ + indiv_obs_batch = [[] for _ in range(self.num_agents)] # [ [states of agent 1], ... ,[states of agent n] ] ] + indiv_action_batch = [[] for _ in range(self.num_agents)] # [ [actions of agent 1], ... 
, [actions of agent n]] + indiv_reward_batch = [[] for _ in range(self.num_agents)] + indiv_next_obs_batch = [[] for _ in range(self.num_agents)] + + global_state_batch = [] + global_next_state_batch = [] + global_actions_batch = [] + done_batch = [] + + batch = random.sample(self.buffer, batch_size) + + + for experience in batch: + state, action, reward, next_state, done = experience + + for i in range(self.num_agents): + indiv_obs_batch[i].append(state[i]) + indiv_action_batch[i].append(action[i]) + indiv_reward_batch[i].append(reward[i]) + indiv_next_obs_batch[i].append(next_state[i]) + + global_state_batch.append(torch.cat(state)) + global_actions_batch.append(torch.cat(action)) + global_next_state_batch.append(torch.cat(next_state)) + done_batch.append(done) + + global_state_batch = torch.stack(global_state_batch) + global_actions_batch = torch.stack(global_actions_batch) + global_next_state_batch = torch.stack(global_next_state_batch) + done_batch = torch.stack(done_batch) + indiv_obs_batch = torch.stack([torch.FloatTensor(obs) for obs in indiv_obs_batch]) + indiv_action_batch = torch.stack([torch.FloatTensor(act) for act in indiv_action_batch]) + indiv_reward_batch = torch.stack([torch.FloatTensor(rew) for rew in indiv_reward_batch]) + indiv_next_obs_batch = torch.stack([torch.FloatTensor(next_obs) for next_obs in indiv_next_obs_batch]) + + return indiv_obs_batch, indiv_action_batch, indiv_reward_batch, indiv_next_obs_batch, global_state_batch, global_actions_batch, global_next_state_batch, done_batch + + def __len__(self): + return len(self.buffer) \ No newline at end of file From daa8b2ac9ea6b785279c2a96e6810f03d68857f0 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 12:42:07 +0530 Subject: [PATCH 07/39] shared mlp --- genrl/utils/utils.py | 91 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index 89e53337..fe9f1e3c 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -62,6 +62,97 @@ def mlp( return nn.Sequential(*layers) +def shared_mlp( + network1_prev, + network2_prev, + shared, + network1_post, + network2_post, + weight_init, + activation_func, + sac + ) +""" + Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) + + :param network1_prev: Sizes of network1's initial layers + :param network2_prev: Sizes of network2's initial layers + :param shared: Sizes of shared layers + :param network1_post: Sizes of network1's latter layers + :param network2_post: Sizes of network2's latter layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param sac: True if Soft Actor Critic is being used, else False + :type network1_prev,network2_prev,shared,network1_post,network2_post: tuple or list + :type weight_init,activation_func: string + :type sac: bool + :returns: network1 and networ2(Neural Network with fully-connected linear layers and + activation layers) + """ + + if len(network1_prev) != 0: + network1_prev = nn.ModuleList() + if len(network2_prev) != 0: + network2_prev = nn.ModuleList() + if len(shared) != 0: + shared = nn.ModuleList() + if len(network1_post) != 0: + network1_post = nn.ModuleList() + if len(network2_post) != 0: + network2_post = nn.ModuleList() + + + # add more activation functions + if activation_func == "relu": + activation = F.relu + elif activation_func == "tanh": + activation = torch.tanh + else: + activation = None + + # add more weight init + if weight_init 
== "xavier_uniform": + weight_init = torch.nn.init.xavier_uniform_ + elif weight_init == "xavier_normal": + weight_init = torch.nn.init.xavier_normal_ + else: + weight_init = None + + if len(shared) != 0 or len(network1_post) != 0 or len(network2_post) != 0: + if not (network1_prev[-1]==network2_prev[-1] and network1_prev[-1]==shared[0] and network1_post[0]==network2_post[0] and network1_post[0]==shared[-1]): + raise ValueError + + for i in range(len(network1_prev)-1): + network1_prev.append(nn.Linear(network1_prev[i],network1_prev[i+1])) + if weight_init is not None: + weight_init(network1_prev[-1].weight) + + for i in range(len(network2_prev)-1): + network2_prev.append(nn.Linear(network2_prev[i],network2_prev[i+1])) + if weight_init is not None: + weight_init(network2_prev[-1].weight) + + for i in range(len(shared)-1): + shared.append(nn.Linear(shared[i], shared[i+1])) + if weight_init is not None: + weight_init(shared[-1].weight) + + for i in range(len(network1_post)-1): + network1_post.append(nn.Linear(network1_post[i],network1_post[i+1])) + if weight_init is not None: + weight_init(network1_post[-1].weight) + + for i in range(len(network2_post)-1): + network2_post.append(nn.Linear(network2_post[i],network2_post[i+1])) + if weight_init is not None: + weight_init(network2_post[-1].weight) + + + network1 = nn.Sequential(network1_prev,shared,network1_post) + network2 = nn.Sequential(network2_prev,shared,network2_post) + + return network1,network2 + def cnn( channels: Tuple = (4, 16, 32), From 44db72eafe343f5ddf10bf996af5f2a2d8f5fd96 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 13:39:00 +0530 Subject: [PATCH 08/39] adding changes --- genrl/core/actor_critic.py | 108 ++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 62ad130c..26ec614b 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -8,7 +8,7 @@ from genrl.core.base import BaseActorCritic from genrl.core.policies import MlpPolicy from genrl.core.values import MlpValue -from genrl.utils.utils import cnn +from genrl.utils.utils import cnn, shared_mlp class MlpActorCritic(BaseActorCritic): @@ -216,10 +216,116 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: return value +class SharedActorCritic(BaseActorCritic): + def __init__( + self, + critic_prev, + actor_prev, + shared, + critic_post, + actor_post, + weight_init, + activation_func + ): + super(SharedActorCritic, self).__init__() + + self.critic,self.actor = shared_mlp(critic_prev,actor_prev,shared,critic_post,actor_post,weight_init,activation_func) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def forward(self, state_critic,state_action): + + if state_critic is not None: + + for i in range(len(self.actorcritic.network1_prev)): + if self.actorcritic.activation is not None: + state_critic = self.actorcritic.activation(self.actorcritic.network1_prev[i](state_critic)) + else: + state_critic = self.actorcritic.network1_prev[i](state_critic) + + for i in range(len(self.actorcritic.shared)): + if self.actorcritic.activation is not None: + state_critic = self.actorcritic.activation(self.actorcritic.shared[i](state_critic)) + else: + state_critic = self.actorcritic.shared[i](state_critic) + + for i in range(len(self.actorcritic.network1_post)): + if self.actorcritic.activation is not None: + state_critic = self.actorcritic.activation(self.actorcritic.network1_post[i](state_critic)) + else: + 
state_critic = self.actorcritic.network1_post[i](state_critic) + + return state_critic + + if state_action is not None: + + for i in range(len(self.actorcritic.network2_prev)): + if self.actorcritic.activation is not None: + state_action = self.actorcritic.activation(self.actorcritic.network2_prev[i](state_action)) + else: + state_action = self.actorcritic.network2_prev[i](state_action) + + for i in range(len(self.actorcritic.shared)): + if self.actorcritic.activation is not None: + state_action = self.actorcritic.activation(self.actorcritic.shared[i](state_action)) + else: + state_action = self.actorcritic.shared[i](state_action) + + for i in range(len(self.actorcritic.network2_post)): + if self.actorcritic.activations is not None: + state_action = self.actorcritic.activation(self.actorcritic.network2_post[i](state_action)) + else: + state_action = self.actorcritic.network2_post[i](state_action) + + return state_action + + + + def get_action(self, state, one_hot=False, deterministic=False): + # state = torch.FloatTensor(state).to(self.device) + logits = self.forward(None,state) + if one_hot: + if deterministic: + logits = self.onehot_from_logits(logits,eps=1.0) + else: + logits = self.onehot_from_logits(logits,eps=0.0) + return logits + + dist = F.softmax(logits, dim=0) + probs = Categorical(dist) + if deterministic: + index = torch.argmax(probs) + else: + index = probs.sample().cpu().detach().item() + return index + + def onehot_from_logits(self, logits, eps=0.0): + # get best (according to current policy) actions in one-hot form + argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() + if eps == 0.0: + return argmax_acs + # get random actions in one-hot form + rand_acs = torch.eye(logits.shape[1])[ + [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] + ] + # chooses between best and random actions using epsilon greedy + return torch.stack( + [ + argmax_acs[i] if r > eps else rand_acs[i] + for i, r in enumerate(torch.rand(logits.shape[0])) + ] + ) + + def get_value(self, state): + # state = torch.FloatTensor(state).to(self.device) + value = self.forward(state,None) + return value + + actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, "mlp12": MlpSingleActorMultiCritic, + "mlpshared": SharedActorCritic, } From 4ef8f4872043af2e5c5d40ad92df9be0e9e5d147 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 14:14:36 +0530 Subject: [PATCH 09/39] new mlp for maddpg --- genrl/core/actor_critic.py | 43 +++----------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 26ec614b..9fe1974f 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -232,51 +232,14 @@ def __init__( self.critic,self.actor = shared_mlp(critic_prev,actor_prev,shared,critic_post,actor_post,weight_init,activation_func) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + def forward(self, state_critic,state_action): if state_critic is not None: - - for i in range(len(self.actorcritic.network1_prev)): - if self.actorcritic.activation is not None: - state_critic = self.actorcritic.activation(self.actorcritic.network1_prev[i](state_critic)) - else: - state_critic = self.actorcritic.network1_prev[i](state_critic) - - for i in range(len(self.actorcritic.shared)): - if self.actorcritic.activation is not None: - state_critic = self.actorcritic.activation(self.actorcritic.shared[i](state_critic)) - else: - state_critic = 
self.actorcritic.shared[i](state_critic) - - for i in range(len(self.actorcritic.network1_post)): - if self.actorcritic.activation is not None: - state_critic = self.actorcritic.activation(self.actorcritic.network1_post[i](state_critic)) - else: - state_critic = self.actorcritic.network1_post[i](state_critic) - - return state_critic + return self.critic(state_critic) if state_action is not None: - - for i in range(len(self.actorcritic.network2_prev)): - if self.actorcritic.activation is not None: - state_action = self.actorcritic.activation(self.actorcritic.network2_prev[i](state_action)) - else: - state_action = self.actorcritic.network2_prev[i](state_action) - - for i in range(len(self.actorcritic.shared)): - if self.actorcritic.activation is not None: - state_action = self.actorcritic.activation(self.actorcritic.shared[i](state_action)) - else: - state_action = self.actorcritic.shared[i](state_action) - - for i in range(len(self.actorcritic.network2_post)): - if self.actorcritic.activations is not None: - state_action = self.actorcritic.activation(self.actorcritic.network2_post[i](state_action)) - else: - state_action = self.actorcritic.network2_post[i](state_action) - - return state_action + return self.actor(state_action) From d8cf1a92cbaf3ef926e70b62cf6f6db8438bc2d8 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 14:26:38 +0530 Subject: [PATCH 10/39] adding environment loader --- genrl/environments/suite.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/genrl/environments/suite.py b/genrl/environments/suite.py index 11310c08..299dd1af 100644 --- a/genrl/environments/suite.py +++ b/genrl/environments/suite.py @@ -97,3 +97,33 @@ def AtariEnv( env = wrapper(env) return env + + +def MultiAgentParticleEnv( + scenario_name:str, + benchmark:bool + )->gym.Env: + """ + Function to apply wrappers for all Atari envs by Trainer class + + :param scenarion_name: Environment Name + :type env: string + :param benchmark: laod benchmark results + :type wrapper_list: bool + :returns: Gym Atari Environment + :rtype: object + """ + + import multiagent.scenarios as scenarios + # load scenario from script + scenario = scenarios.load(scenario_name + ".py").Scenario() + # create world + world = scenario.make_world() + # create multiagent environment + if benchmark: + env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) + else: + env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) + + return env + From 8d2cf066b1d0f5193c16cb4790525c35d7c6ceff Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 15:10:11 +0530 Subject: [PATCH 11/39] Adding Actor and Critic classes --- genrl/core/actor_critic.py | 89 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 9fe1974f..c56ced4f 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -284,6 +284,95 @@ def get_value(self, state): return value + +class Actor(BaseActorCritic): + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + policy_layers: Tuple = (32, 32), + discrete: bool = True, + **kwargs, + ): + def __init__(self, layer_sizes,weight_init,activation_func): + super(Actor, self).__init__() + + self.actor = MlpPolicy(layer, action_dim, policy_layers, discrete, **kwargs) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def forward(self, 
policy): + policy = self.actor(policy) + return policy + + + + def get_action(self, state, one_hot=False, deterministic=False): + # state = torch.FloatTensor(state).to(self.device) + logits = self.forward(state) + if one_hot: + if deterministic: + logits = self.onehot_from_logits(logits,eps=1.0) + else: + logits = self.onehot_from_logits(logits,eps=0.0) + return logits + + dist = F.softmax(logits, dim=0) + probs = Categorical(dist) + if deterministic: + index = torch.argmax(probs) + else: + index = probs.sample().cpu().detach().item() + return index + + def onehot_from_logits(self, logits, eps=0.0): + # get best (according to current policy) actions in one-hot form + argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() + if eps == 0.0: + return argmax_acs + # get random actions in one-hot form + rand_acs = torch.eye(logits.shape[1])[ + [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] + ] + # chooses between best and random actions using epsilon greedy + return torch.stack( + [ + argmax_acs[i] if r > eps else rand_acs[i] + for i, r in enumerate(torch.rand(logits.shape[0])) + ] + ) + + +class Critic(BaseActorCritic): + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + policy_layers: Tuple = (32, 32), + value_layers: Tuple = (32, 32), + val_type: str = "V", + discrete: bool = True, + **kwargs, + ): + super(MlpActorCritic, self).__init__() + + self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def forward(self, value): + + value = self.critic(value) + + return value + + + + def get_value(self, state): + # state = torch.FloatTensor(state).to(self.device) + value = self.forward(state) + return value + + actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, From 1365585b4daae5c505d448db3958b8474466129c Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 15:19:37 +0530 Subject: [PATCH 12/39] adding new functionalities --- genrl/environments/gym_wrapper.py | 104 ++++++++++++++++++++++++++++++ genrl/utils/utils.py | 71 +++++++++++++++++--- 2 files changed, 167 insertions(+), 8 deletions(-) diff --git a/genrl/environments/gym_wrapper.py b/genrl/environments/gym_wrapper.py index ceb99472..e283daa6 100644 --- a/genrl/environments/gym_wrapper.py +++ b/genrl/environments/gym_wrapper.py @@ -106,3 +106,107 @@ def close(self) -> None: Closes environment """ self.env.close() + + +class MultiGymWrapper(gym.Wrapper): + """ + Wrapper class for all MultiAgent Particle Environments + + :param env: Gym environment name + :param n_envs: Number of environments. 
None if not vectorised + :param parallel: If vectorised, should environments be run through \ +serially or parallelly + :type env: string + :type n_envs: None, int + :type parallel: boolean + """ + + def __init__(self, env: gym.Env): + super(GymWrapper, self).__init__(env) + self.env = env + + self.observation_space = self.env.observation_space + self.action_space = self.env.action_space + + self.state = None + self.action = None + self.reward = None + self.done = False + self.info = {} + + def __getattr__(self, name: str) -> Any: + """ + All other calls would go to base env + """ + env = super(GymWrapper, self).__getattribute__("env") + return getattr(env, name) + + @property + def obs_shape(self): + if isinstance(self.env.observation_space, gym.spaces.Discrete): + obs_shape = (1,) + elif isinstance(self.env.observation_space, gym.spaces.Box): + obs_shape = self.env.observation_space.shape + return obs_shape + + @property + def action_shape(self): + if isinstance(self.env.action_space, gym.spaces.Box): + action_shape = self.env.action_space.shape + elif isinstance(self.env.action_space, gym.spaces.Discrete): + action_shape = (1,) + return action_shape + + def sample(self) -> np.ndarray: + """ + Shortcut method to directly sample from environment's action space + + :returns: Random action from action space + :rtype: NumPy Array + """ + return self.env.action_space.sample() + + def render(self, mode: str = "human") -> None: + """ + Renders all envs in a tiles format similar to baselines. + + :param mode: Can either be 'human' or 'rgb_array'. \ +Displays tiled images in 'human' and returns tiled images in 'rgb_array' + :type mode: string + """ + self.env.render(mode=mode) + + def seed(self, seed: int = None) -> None: + """ + Set environment seed + + :param seed: Value of seed + :type seed: int + """ + self.env.seed(seed) + + def step(self, action: np.ndarray) -> np.ndarray: + """ + Steps the env through given action + + :param action: Action taken by agent + :type action: NumPy array + :returns: Next observation, reward, game status and debugging info + """ + self.state, self.reward, self.done, self.info = self.env.step(action) + self.action = action + return self.state, self.reward, self.done, self.info + + def reset(self) -> np.ndarray: + """ + Resets environment + + :returns: Initial state + """ + return self.env.reset() + + def close(self) -> None: + """ + Closes environment + """ + self.env.close() diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index fe9f1e3c..b57086fa 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -62,6 +62,52 @@ def mlp( return nn.Sequential(*layers) + +# If at all you need to concatenate states to actions after passing states through n FC layers +def mlp_( + self, + layer_sizes, + weight_init, + activation_func, + concat_ind, + sac + ): + """ + Generates an MLP model given sizes of each layer + + :param layer_sizes: Sizes of hidden layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param concat_ind: index of layer at which actions to be concatenated + :param sac: True if Soft Actor Critic is being used, else False + :type layer_sizes: tuple or list + :type concat_ind: int + :type sac: bool + :type weight_init,activation_func: string + :returns: (Neural Network with fully-connected linear layers and + activation layers) + """ + layers = [] + limit = len(layer_sizes) if sac is False else len(sizes) - 1 + + # add more activations + activation = nn.Tanh() if activation_func == 
"tanh" else nn.ReLU() + + # add more weight init + if weight_init == "xavier_uniform": + weight_init = torch.nn.init.xavier_uniform_ + elif weight_init == "xavier_normal": + weight_init = torch.nn.init.xavier_normal_ + + + for layer in range(limit - 1): + if layer==concat_ind: + continue + act = activation if layer < limit - 2 else nn.Identity() + layers += [nn.Linear(sizes[layer], sizes[layer + 1]), act] + weight_init(layers[-1][0].weight) + + def shared_mlp( network1_prev, network2_prev, @@ -102,14 +148,6 @@ def shared_mlp( network2_post = nn.ModuleList() - # add more activation functions - if activation_func == "relu": - activation = F.relu - elif activation_func == "tanh": - activation = torch.tanh - else: - activation = None - # add more weight init if weight_init == "xavier_uniform": weight_init = torch.nn.init.xavier_uniform_ @@ -118,32 +156,49 @@ def shared_mlp( else: weight_init = None + if activation_func == "relu": + activation = nn.ReLU() + elif activation_func == "tanh": + activation = nn.Tanh() + else: + activation = None + if len(shared) != 0 or len(network1_post) != 0 or len(network2_post) != 0: if not (network1_prev[-1]==network2_prev[-1] and network1_prev[-1]==shared[0] and network1_post[0]==network2_post[0] and network1_post[0]==shared[-1]): raise ValueError for i in range(len(network1_prev)-1): network1_prev.append(nn.Linear(network1_prev[i],network1_prev[i+1])) + if activation is not None: + network1_prev.append(activation) if weight_init is not None: weight_init(network1_prev[-1].weight) for i in range(len(network2_prev)-1): network2_prev.append(nn.Linear(network2_prev[i],network2_prev[i+1])) + if activation is not None: + network2_prev.append(activation) if weight_init is not None: weight_init(network2_prev[-1].weight) for i in range(len(shared)-1): shared.append(nn.Linear(shared[i], shared[i+1])) + if activation is not None: + shared.append(activation) if weight_init is not None: weight_init(shared[-1].weight) for i in range(len(network1_post)-1): network1_post.append(nn.Linear(network1_post[i],network1_post[i+1])) + if activation is not None: + network1_post.append(activation) if weight_init is not None: weight_init(network1_post[-1].weight) for i in range(len(network2_post)-1): network2_post.append(nn.Linear(network2_post[i],network2_post[i+1])) + if activation is not None: + network2_post.append(activation) if weight_init is not None: weight_init(network2_post[-1].weight) From 5067e42c7a79a648c645a05d0df6a64bbd470810 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 18:00:17 +0530 Subject: [PATCH 13/39] minor changes --- genrl/core/actor_critic.py | 19 +++++++------------ genrl/utils/utils.py | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index c56ced4f..fbb66a8a 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -285,23 +285,22 @@ def get_value(self, state): -class Actor(BaseActorCritic): +class Actor(MlpPolicy): def __init__( self, state_dim: spaces.Space, action_dim: spaces.Space, - policy_layers: Tuple = (32, 32), + hidden: Tuple = (32, 32), discrete: bool = True, **kwargs, ): def __init__(self, layer_sizes,weight_init,activation_func): super(Actor, self).__init__() - self.actor = MlpPolicy(layer, action_dim, policy_layers, discrete, **kwargs) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def forward(self, policy): - policy = self.actor(policy) + policy = self.model(policy) return policy @@ -342,26 +341,22 @@ 
def onehot_from_logits(self, logits, eps=0.0): ) -class Critic(BaseActorCritic): +class Critic(MlpValue): def __init__( self, state_dim: spaces.Space, action_dim: spaces.Space, - policy_layers: Tuple = (32, 32), - value_layers: Tuple = (32, 32), + fc_layers: Tuple = (32, 32), val_type: str = "V", - discrete: bool = True, **kwargs, ): - super(MlpActorCritic, self).__init__() - - self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) + super(Critic, self).__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def forward(self, value): - value = self.critic(value) + value = self.model(value) return value diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index b57086fa..a1b7a501 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -117,7 +117,7 @@ def shared_mlp( weight_init, activation_func, sac - ) + ): """ Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) From 6f0563e885738ac58d3fe4299596e8f0288cfa6b Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Thu, 3 Sep 2020 18:01:55 +0530 Subject: [PATCH 14/39] added return statement to mlp_ --- genrl/utils/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index a1b7a501..e80ca125 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -107,6 +107,8 @@ def mlp_( layers += [nn.Linear(sizes[layer], sizes[layer + 1]), act] weight_init(layers[-1][0].weight) + return nn.Sequential(*layers) + def shared_mlp( network1_prev, From 5061abe111efbb0324b52b02920d639ba9da6362 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 11:22:27 +0530 Subject: [PATCH 15/39] rectifying --- .pre-commit-config.yaml | 6 +- genrl/core/actor_critic.py | 39 ++++--- genrl/environments/gym_wrapper.py | 2 +- genrl/utils/utils.py | 179 +++++++++++++++--------------- 4 files changed, 110 insertions(+), 116 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c68e57a..7daa29ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: https://github.com/asottile/seed-isort-config - rev: v1.9.4 + rev: v2.2.0 hooks: - id: seed-isort-config args: [--exclude=^((examples|docs)/.*)$] - repo: https://github.com/timothycrosley/isort - rev: 4.3.2 + rev: 5.5.0 hooks: - id: isort @@ -14,7 +14,7 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3.7 + language_version: python3 - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.3 diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index fbb66a8a..66a49d87 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -218,22 +218,29 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: class SharedActorCritic(BaseActorCritic): def __init__( - self, + self, critic_prev, actor_prev, shared, critic_post, actor_post, weight_init, - activation_func - ): + activation_func, + ): super(SharedActorCritic, self).__init__() - self.critic,self.actor = shared_mlp(critic_prev,actor_prev,shared,critic_post,actor_post,weight_init,activation_func) + self.critic, self.actor = shared_mlp( + critic_prev, + actor_prev, + shared, + critic_post, + actor_post, + weight_init, + activation_func, + ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - def forward(self, state_critic,state_action): + def forward(self, state_critic, state_action): if state_critic is not None: return self.critic(state_critic) @@ -241,16 +248,14 @@ def forward(self, 
state_critic,state_action): if state_action is not None: return self.actor(state_action) - - def get_action(self, state, one_hot=False, deterministic=False): # state = torch.FloatTensor(state).to(self.device) - logits = self.forward(None,state) + logits = self.forward(None, state) if one_hot: if deterministic: - logits = self.onehot_from_logits(logits,eps=1.0) + logits = self.onehot_from_logits(logits, eps=1.0) else: - logits = self.onehot_from_logits(logits,eps=0.0) + logits = self.onehot_from_logits(logits, eps=0.0) return logits dist = F.softmax(logits, dim=0) @@ -280,11 +285,10 @@ def onehot_from_logits(self, logits, eps=0.0): def get_value(self, state): # state = torch.FloatTensor(state).to(self.device) - value = self.forward(state,None) + value = self.forward(state, None) return value - class Actor(MlpPolicy): def __init__( self, @@ -294,7 +298,6 @@ def __init__( discrete: bool = True, **kwargs, ): - def __init__(self, layer_sizes,weight_init,activation_func): super(Actor, self).__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -303,16 +306,14 @@ def forward(self, policy): policy = self.model(policy) return policy - - def get_action(self, state, one_hot=False, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(state) if one_hot: if deterministic: - logits = self.onehot_from_logits(logits,eps=1.0) + logits = self.onehot_from_logits(logits, eps=1.0) else: - logits = self.onehot_from_logits(logits,eps=0.0) + logits = self.onehot_from_logits(logits, eps=0.0) return logits dist = F.softmax(logits, dim=0) @@ -360,8 +361,6 @@ def forward(self, value): return value - - def get_value(self, state): # state = torch.FloatTensor(state).to(self.device) value = self.forward(state) diff --git a/genrl/environments/gym_wrapper.py b/genrl/environments/gym_wrapper.py index e283daa6..e6075c8f 100644 --- a/genrl/environments/gym_wrapper.py +++ b/genrl/environments/gym_wrapper.py @@ -122,7 +122,7 @@ class MultiGymWrapper(gym.Wrapper): """ def __init__(self, env: gym.Env): - super(GymWrapper, self).__init__(env) + super(MultiGymWrapper, self).__init__(env) self.env = env self.observation_space = self.env.observation_space diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index e80ca125..afbb800e 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -13,13 +13,13 @@ def get_model(type_: str, name_: str) -> Union: """ - Utility to get the class of required function + Utility to get the class of required function - :param type_: "ac" for Actor Critic, "v" for Value, "p" for Policy - :param name_: Name of the specific structure of model. ( + :param type_: "ac" for Actor Critic, "v" for Value, "p" for Policy + :param name_: Name of the specific structure of model. ( Eg. "mlp" or "cnn") - :type type_: string - :returns: Required class. Eg. MlpActorCritic + :type type_: string + :returns: Required class. Eg. 
MlpActorCritic """ if type_ == "ac": from genrl.core import get_actor_critic_from_name @@ -42,13 +42,13 @@ def mlp( sac: bool = False, ): """ - Generates an MLP model given sizes of each layer + Generates an MLP model given sizes of each layer - :param sizes: Sizes of hidden layers - :param sac: True if Soft Actor Critic is being used, else False - :type sizes: tuple or list - :type sac: bool - :returns: (Neural Network with fully-connected linear layers and + :param sizes: Sizes of hidden layers + :param sac: True if Soft Actor Critic is being used, else False + :type sizes: tuple or list + :type sac: bool + :returns: (Neural Network with fully-connected linear layers and activation layers) """ layers = [] @@ -64,27 +64,20 @@ def mlp( # If at all you need to concatenate states to actions after passing states through n FC layers -def mlp_( - self, - layer_sizes, - weight_init, - activation_func, - concat_ind, - sac - ): +def mlp_(self, layer_sizes, weight_init, activation_func, concat_ind, sac): """ - Generates an MLP model given sizes of each layer - - :param layer_sizes: Sizes of hidden layers - :param weight_init: type of weight initialization - :param activation_func: type of activation function - :param concat_ind: index of layer at which actions to be concatenated - :param sac: True if Soft Actor Critic is being used, else False - :type layer_sizes: tuple or list - :type concat_ind: int - :type sac: bool - :type weight_init,activation_func: string - :returns: (Neural Network with fully-connected linear layers and + Generates an MLP model given sizes of each layer + + :param layer_sizes: Sizes of hidden layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param concat_ind: index of layer at which actions to be concatenated + :param sac: True if Soft Actor Critic is being used, else False + :type layer_sizes: tuple or list + :type concat_ind: int + :type sac: bool + :type weight_init,activation_func: string + :returns: (Neural Network with fully-connected linear layers and activation layers) """ layers = [] @@ -99,9 +92,8 @@ def mlp_( elif weight_init == "xavier_normal": weight_init = torch.nn.init.xavier_normal_ - for layer in range(limit - 1): - if layer==concat_ind: + if layer == concat_ind: continue act = activation if layer < limit - 2 else nn.Identity() layers += [nn.Linear(sizes[layer], sizes[layer + 1]), act] @@ -118,23 +110,23 @@ def shared_mlp( network2_post, weight_init, activation_func, - sac - ): -""" - Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) - - :param network1_prev: Sizes of network1's initial layers - :param network2_prev: Sizes of network2's initial layers - :param shared: Sizes of shared layers - :param network1_post: Sizes of network1's latter layers - :param network2_post: Sizes of network2's latter layers - :param weight_init: type of weight initialization - :param activation_func: type of activation function - :param sac: True if Soft Actor Critic is being used, else False - :type network1_prev,network2_prev,shared,network1_post,network2_post: tuple or list - :type weight_init,activation_func: string - :type sac: bool - :returns: network1 and networ2(Neural Network with fully-connected linear layers and + sac, +): + """ + Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) + + :param network1_prev: Sizes of network1's initial layers + :param network2_prev: Sizes of network2's initial layers + :param shared: Sizes of shared 
layers + :param network1_post: Sizes of network1's latter layers + :param network2_post: Sizes of network2's latter layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param sac: True if Soft Actor Critic is being used, else False + :type network1_prev,network2_prev,shared,network1_post,network2_post: tuple or list + :type weight_init,activation_func: string + :type sac: bool + :returns: network1 and networ2(Neural Network with fully-connected linear layers and activation layers) """ @@ -149,7 +141,6 @@ def shared_mlp( if len(network2_post) != 0: network2_post = nn.ModuleList() - # add more weight init if weight_init == "xavier_uniform": weight_init = torch.nn.init.xavier_uniform_ @@ -159,56 +150,60 @@ def shared_mlp( weight_init = None if activation_func == "relu": - activation = nn.ReLU() - elif activation_func == "tanh": - activation = nn.Tanh() - else: - activation = None + activation = nn.ReLU() + elif activation_func == "tanh": + activation = nn.Tanh() + else: + activation = None if len(shared) != 0 or len(network1_post) != 0 or len(network2_post) != 0: - if not (network1_prev[-1]==network2_prev[-1] and network1_prev[-1]==shared[0] and network1_post[0]==network2_post[0] and network1_post[0]==shared[-1]): + if not ( + network1_prev[-1] == network2_prev[-1] + and network1_prev[-1] == shared[0] + and network1_post[0] == network2_post[0] + and network1_post[0] == shared[-1] + ): raise ValueError - for i in range(len(network1_prev)-1): - network1_prev.append(nn.Linear(network1_prev[i],network1_prev[i+1])) + for i in range(len(network1_prev) - 1): + network1_prev.append(nn.Linear(network1_prev[i], network1_prev[i + 1])) if activation is not None: network1_prev.append(activation) if weight_init is not None: weight_init(network1_prev[-1].weight) - for i in range(len(network2_prev)-1): - network2_prev.append(nn.Linear(network2_prev[i],network2_prev[i+1])) + for i in range(len(network2_prev) - 1): + network2_prev.append(nn.Linear(network2_prev[i], network2_prev[i + 1])) if activation is not None: network2_prev.append(activation) if weight_init is not None: weight_init(network2_prev[-1].weight) - for i in range(len(shared)-1): - shared.append(nn.Linear(shared[i], shared[i+1])) + for i in range(len(shared) - 1): + shared.append(nn.Linear(shared[i], shared[i + 1])) if activation is not None: shared.append(activation) if weight_init is not None: weight_init(shared[-1].weight) - for i in range(len(network1_post)-1): - network1_post.append(nn.Linear(network1_post[i],network1_post[i+1])) + for i in range(len(network1_post) - 1): + network1_post.append(nn.Linear(network1_post[i], network1_post[i + 1])) if activation is not None: network1_post.append(activation) if weight_init is not None: weight_init(network1_post[-1].weight) - for i in range(len(network2_post)-1): - network2_post.append(nn.Linear(network2_post[i],network2_post[i+1])) + for i in range(len(network2_post) - 1): + network2_post.append(nn.Linear(network2_post[i], network2_post[i + 1])) if activation is not None: network2_post.append(activation) if weight_init is not None: weight_init(network2_post[-1].weight) + network1 = nn.Sequential(network1_prev, shared, network1_post) + network2 = nn.Sequential(network2_prev, shared, network2_post) - network1 = nn.Sequential(network1_prev,shared,network1_post) - network2 = nn.Sequential(network2_prev,shared,network2_post) - - return network1,network2 + return network1, network2 def cnn( @@ -218,18 +213,18 @@ def cnn( **kwargs, ) -> 
(Tuple): """ - (Generates a CNN model given input dimensions, channels, kernel_sizes and + (Generates a CNN model given input dimensions, channels, kernel_sizes and strides) - :param channels: Input output channels before and after each convolution - :param kernel_sizes: Kernel sizes for each convolution - :param strides: Strides for each convolution - :param in_size: Input dimensions (assuming square input) - :type channels: tuple - :type kernel_sizes: tuple - :type strides: tuple - :type in_size: int - :returns: (Convolutional Neural Network with convolutional layers and + :param channels: Input output channels before and after each convolution + :param kernel_sizes: Kernel sizes for each convolution + :param strides: Strides for each convolution + :param in_size: Input dimensions (assuming square input) + :type channels: tuple + :type kernel_sizes: tuple + :type strides: tuple + :type in_size: int + :returns: (Convolutional Neural Network with convolutional layers and activation layers) """ @@ -255,12 +250,12 @@ def noisy_mlp(fc_layers: List[int], noisy_layers: List[int], activation="relu"): """Noisy MLP generating helper function Args: - fc_layers (:obj:`list` of :obj:`int`): List of fully connected layers - noisy_layers (:obj:`list` of :obj:`int`): :ist of noisy layers - activation (str): Activation function to be used. ["tanh", "relu"] + fc_layers (:obj:`list` of :obj:`int`): List of fully connected layers + noisy_layers (:obj:`list` of :obj:`int`): :ist of noisy layers + activation (str): Activation function to be used. ["tanh", "relu"] Returns: - Noisy MLP model + Noisy MLP model """ model = [] act = nn.Tanh if activation == "tanh" else nn.ReLU() @@ -282,15 +277,15 @@ def get_env_properties( env: Union[gym.Env, VecEnv], network: Union[str, Any] = "mlp" ) -> (Tuple[int]): """ - Finds important properties of environment + Finds important properties of environment - :param env: Environment that the agent is interacting with - :type env: Gym Environment - :param network: Type of network architecture, eg. "mlp", "cnn" - :type network: str - :returns: (State space dimensions, Action space dimensions, + :param env: Environment that the agent is interacting with + :type env: Gym Environment + :param network: Type of network architecture, eg. "mlp", "cnn" + :type network: str + :returns: (State space dimensions, Action space dimensions, discreteness of action space and action limit (highest action value) - :rtype: int, float, ...; int, float, ...; bool; int, float, ... + :rtype: int, float, ...; int, float, ...; bool; int, float, ... 
""" if network == "cnn": state_dim = env.framestack From e6a378c3a7d94afb46153b8254c50f6c3ab3350e Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 12:18:52 +0530 Subject: [PATCH 16/39] rectifying 2 --- genrl/core/actor_critic.py | 58 +++++------------- genrl/core/buffers.py | 114 +++++++++++++++++++++--------------- genrl/environments/suite.py | 21 ++++--- genrl/utils/utils.py | 47 +++++++++++---- 4 files changed, 130 insertions(+), 110 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 66a49d87..b81d5f0c 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -2,6 +2,7 @@ import torch # noqa import torch.nn as nn # noqa +import torch.nn.functional as F from gym import spaces from torch.distributions import Categorical, Normal @@ -237,6 +238,7 @@ def __init__( actor_post, weight_init, activation_func, + False, ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -248,7 +250,7 @@ def forward(self, state_critic, state_action): if state_action is not None: return self.actor(state_action) - def get_action(self, state, one_hot=False, deterministic=False): + def get_action(self, state, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(None, state) if one_hot: @@ -266,23 +268,6 @@ def get_action(self, state, one_hot=False, deterministic=False): index = probs.sample().cpu().detach().item() return index - def onehot_from_logits(self, logits, eps=0.0): - # get best (according to current policy) actions in one-hot form - argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() - if eps == 0.0: - return argmax_acs - # get random actions in one-hot form - rand_acs = torch.eye(logits.shape[1])[ - [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] - ] - # chooses between best and random actions using epsilon greedy - return torch.stack( - [ - argmax_acs[i] if r > eps else rand_acs[i] - for i, r in enumerate(torch.rand(logits.shape[0])) - ] - ) - def get_value(self, state): # state = torch.FloatTensor(state).to(self.device) value = self.forward(state, None) @@ -298,15 +283,15 @@ def __init__( discrete: bool = True, **kwargs, ): - super(Actor, self).__init__() + super(Actor, self).__init__(state_dim, action_dim, hidden, discrete ** kwargs) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - def forward(self, policy): - policy = self.model(policy) - return policy + def forward(self, state): + state = self.model(state) + return state - def get_action(self, state, one_hot=False, deterministic=False): + def get_action(self, state, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(state) if one_hot: @@ -324,23 +309,6 @@ def get_action(self, state, one_hot=False, deterministic=False): index = probs.sample().cpu().detach().item() return index - def onehot_from_logits(self, logits, eps=0.0): - # get best (according to current policy) actions in one-hot form - argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() - if eps == 0.0: - return argmax_acs - # get random actions in one-hot form - rand_acs = torch.eye(logits.shape[1])[ - [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] - ] - # chooses between best and random actions using epsilon greedy - return torch.stack( - [ - argmax_acs[i] if r > eps else rand_acs[i] - for i, r in enumerate(torch.rand(logits.shape[0])) - ] - ) - class Critic(MlpValue): def __init__( @@ -351,15 +319,17 @@ def __init__( 
val_type: str = "V", **kwargs, ): - super(Critic, self).__init__() + super(Critic, self).__init__( + state_dim, action_dim, fc_layers, val_type, **kwargs + ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - def forward(self, value): + def forward(self, state): - value = self.model(value) + state = self.model(state) - return value + return state def get_value(self, state): # state = torch.FloatTensor(state).to(self.device) diff --git a/genrl/core/buffers.py b/genrl/core/buffers.py index dcfe5f89..f87d8343 100644 --- a/genrl/core/buffers.py +++ b/genrl/core/buffers.py @@ -23,13 +23,6 @@ class PrioritizedReplayBufferSamples(NamedTuple): indices: torch.Tensor weights: torch.Tensor -class MultiAgentReplayBuffer(NamedTuple): - states: torch.Tensor - actions: torch.Tensor - rewards: torch.Tensor - next_states: torch.Tensor - dones: torch.Tensor - class ReplayBuffer: """ @@ -124,15 +117,15 @@ def sample( ] ): """ - (Returns randomly sampled memories from replay memory along with their + (Returns randomly sampled memories from replay memory along with their respective indices and weights) - :param batch_size: Number of samples per batch - :param beta: (Bias exponent used to correct + :param batch_size: Number of samples per batch + :param beta: (Bias exponent used to correct Importance Sampling (IS) weights) - :type batch_size: int - :type beta: float - :returns: (Tuple containing `states`, `actions`, `next_states`, + :type batch_size: int + :type beta: float + :returns: (Tuple containing `states`, `actions`, `next_states`, `rewards`, `dones`, `indices` and `weights`) """ if beta is None: @@ -190,46 +183,52 @@ def pos(self): return len(self.buffer) - class MultiAgentReplayBuffer: """ - Implements the basic Experience Replay Mechanism for MultiAgents by feeding in global states, - global actions, global rewards, global next_states, global dones - - :param capacity: Size of the replay buffer - :type capacity: int - :param num_agents: Number of agents in the environment - :type num_agents: int + Implements the basic Experience Replay Mechanism for MultiAgents + by feeding in global states, global actions, global rewards, + global next_states, global dones + + :param capacity: Size of the replay buffer + :type capacity: int + :param num_agents: Number of agents in the environment + :type num_agents: int """ - def __init__(self, num_agents, capacity): + + def __init__(self, num_agents: int, capacity: int): self.capacity = capacity self.num_agents = num_agents - self.buffer = deque(maxlen=max_size) + self.buffer = deque(maxlen=self.capacity) def push(self, inp: Tuple) -> None: """ - Adds new experience to buffer + Adds new experience to buffer - :param inp: (Tuple containing `state`, `action`, `reward`, - `next_state` and `done`) - :type inp: tuple - :returns: None + :param inp: (Tuple containing `state`, `action`, `reward`, + `next_state` and `done`) + :type inp: tuple + :returns: None """ self.buffer.append(inp) - - + def sample(self, batch_size): """ - Returns randomly sampled experiences from replay memory - - :param batch_size: Number of samples per batch - :type batch_size: int - :returns: (Tuple composing of `indiv_obs_batch`, `indiv_action_batch`, `indiv_reward_batch`, `indiv_next_obs_batch`, - `global_state_batch`, `global_actions_batch`, `global_next_state_batch`, `done_batch`) + Returns randomly sampled experiences from replay memory + + :param batch_size: Number of samples per batch + :type batch_size: int + :returns: (Tuple composing of `indiv_obs_batch`, + 
`indiv_action_batch`, `indiv_reward_batch`, `indiv_next_obs_batch`, + `global_state_batch`, `global_actions_batch`, `global_next_state_batch`, + `done_batch`) """ - indiv_obs_batch = [[] for _ in range(self.num_agents)] # [ [states of agent 1], ... ,[states of agent n] ] ] - indiv_action_batch = [[] for _ in range(self.num_agents)] # [ [actions of agent 1], ... , [actions of agent n]] + indiv_obs_batch = [ + [] for _ in range(self.num_agents) + ] # [ [states of agent 1], ... ,[states of agent n] ] ] + indiv_action_batch = [ + [] for _ in range(self.num_agents) + ] # [ [actions of agent 1], ... , [actions of agent n]] indiv_reward_batch = [[] for _ in range(self.num_agents)] indiv_next_obs_batch = [[] for _ in range(self.num_agents)] @@ -240,10 +239,9 @@ def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) - for experience in batch: state, action, reward, next_state, done = experience - + for i in range(self.num_agents): indiv_obs_batch[i].append(state[i]) indiv_action_batch[i].append(action[i]) @@ -259,12 +257,34 @@ def sample(self, batch_size): global_actions_batch = torch.stack(global_actions_batch) global_next_state_batch = torch.stack(global_next_state_batch) done_batch = torch.stack(done_batch) - indiv_obs_batch = torch.stack([torch.FloatTensor(obs) for obs in indiv_obs_batch]) - indiv_action_batch = torch.stack([torch.FloatTensor(act) for act in indiv_action_batch]) - indiv_reward_batch = torch.stack([torch.FloatTensor(rew) for rew in indiv_reward_batch]) - indiv_next_obs_batch = torch.stack([torch.FloatTensor(next_obs) for next_obs in indiv_next_obs_batch]) - - return indiv_obs_batch, indiv_action_batch, indiv_reward_batch, indiv_next_obs_batch, global_state_batch, global_actions_batch, global_next_state_batch, done_batch + indiv_obs_batch = torch.stack( + [torch.FloatTensor(obs) for obs in indiv_obs_batch] + ) + indiv_action_batch = torch.stack( + [torch.FloatTensor(act) for act in indiv_action_batch] + ) + indiv_reward_batch = torch.stack( + [torch.FloatTensor(rew) for rew in indiv_reward_batch] + ) + indiv_next_obs_batch = torch.stack( + [torch.FloatTensor(next_obs) for next_obs in indiv_next_obs_batch] + ) + + return ( + indiv_obs_batch, + indiv_action_batch, + indiv_reward_batch, + indiv_next_obs_batch, + global_state_batch, + global_actions_batch, + global_next_state_batch, + done_batch, + ) def __len__(self): - return len(self.buffer) \ No newline at end of file + """ + Gives number of experiences in buffer currently + + :returns: Length of replay memory + """ + return len(self.buffer) diff --git a/genrl/environments/suite.py b/genrl/environments/suite.py index 299dd1af..3e69ca40 100644 --- a/genrl/environments/suite.py +++ b/genrl/environments/suite.py @@ -99,10 +99,7 @@ def AtariEnv( return env -def MultiAgentParticleEnv( - scenario_name:str, - benchmark:bool - )->gym.Env: +def MultiAgentParticleEnv(scenario_name: str, benchmark: bool) -> gym.Env: """ Function to apply wrappers for all Atari envs by Trainer class @@ -113,17 +110,25 @@ def MultiAgentParticleEnv( :returns: Gym Atari Environment :rtype: object """ - import multiagent.scenarios as scenarios + from multiagent.environment import MultiAgentEnv + # load scenario from script scenario = scenarios.load(scenario_name + ".py").Scenario() # create world world = scenario.make_world() # create multiagent environment if benchmark: - env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) + env = MultiAgentEnv( + world, + scenario.reset_world, + 
scenario.reward, + scenario.observation, + scenario.benchmark_data, + ) else: - env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) + env = MultiAgentEnv( + world, scenario.reset_world, scenario.reward, scenario.observation + ) return env - diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index afbb800e..02310a30 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -64,7 +64,14 @@ def mlp( # If at all you need to concatenate states to actions after passing states through n FC layers -def mlp_(self, layer_sizes, weight_init, activation_func, concat_ind, sac): +def mlp_( + self, + layer_sizes: Tuple, + weight_init: str, + activation_func: str, + concat_ind: int, + sac: bool, +): """ Generates an MLP model given sizes of each layer @@ -81,7 +88,7 @@ def mlp_(self, layer_sizes, weight_init, activation_func, concat_ind, sac): activation layers) """ layers = [] - limit = len(layer_sizes) if sac is False else len(sizes) - 1 + limit = len(layer_sizes) if sac is False else len(layer_sizes) - 1 # add more activations activation = nn.Tanh() if activation_func == "tanh" else nn.ReLU() @@ -96,21 +103,21 @@ def mlp_(self, layer_sizes, weight_init, activation_func, concat_ind, sac): if layer == concat_ind: continue act = activation if layer < limit - 2 else nn.Identity() - layers += [nn.Linear(sizes[layer], sizes[layer + 1]), act] + layers += [nn.Linear(layer_sizes[layer], layer_sizes[layer + 1]), act] weight_init(layers[-1][0].weight) return nn.Sequential(*layers) def shared_mlp( - network1_prev, - network2_prev, - shared, - network1_post, - network2_post, - weight_init, - activation_func, - sac, + network1_prev: Tuple, + network2_prev: Tuple, + shared: Tuple, + network1_post: Tuple, + network2_post: Tuple, + weight_init: str, + activation_func: str, + sac: bool, ): """ Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) @@ -342,3 +349,21 @@ def safe_mean(log: Union[torch.Tensor, List[int]]): else: func = np.mean return func(log) + + +def onehot_from_logits(self, logits, eps=0.0): + # get best (according to current policy) actions in one-hot form + argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() + if eps == 0.0: + return argmax_acs + # get random actions in one-hot form + rand_acs = torch.eye(logits.shape[1])[ + [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] + ] + # chooses between best and random actions using epsilon greedy + return torch.stack( + [ + argmax_acs[i] if r > eps else rand_acs[i] + for i, r in enumerate(torch.rand(logits.shape[0])) + ] + ) From 915d19d8ac29a29b52da15679e49f87e1f9a69db Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 12:58:24 +0530 Subject: [PATCH 17/39] rectifying 3 --- genrl/environments/gym_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genrl/environments/gym_wrapper.py b/genrl/environments/gym_wrapper.py index e6075c8f..cbbc96e4 100644 --- a/genrl/environments/gym_wrapper.py +++ b/genrl/environments/gym_wrapper.py @@ -138,7 +138,7 @@ def __getattr__(self, name: str) -> Any: """ All other calls would go to base env """ - env = super(GymWrapper, self).__getattribute__("env") + env = super(MultiGymWrapper, self).__getattribute__("env") return getattr(env, name) @property From b0b5025ad2a8c6f588854cebdb682735e860aac4 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 13:23:23 +0530 Subject: [PATCH 18/39] adding test for mlp_concat --- genrl/utils/utils.py | 17 ++++++++--------- 
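Note: a minimal usage sketch for the MultiAgentParticleEnv loader shown above (despite the docstring borrowed from AtariEnv, it builds a multi-agent particle environment, not an Atari one). It assumes the external multiagent-particle-envs package is installed, since the loader imports the multiagent module internally; "simple_spread" is just an example scenario name shipped with that package.

    from genrl.environments.suite import MultiAgentParticleEnv

    # benchmark=False builds the env without the extra benchmark_data callback
    env = MultiAgentParticleEnv(scenario_name="simple_spread", benchmark=False)

    obs_n = env.reset()               # list with one observation per agent
    n_agents = len(env.action_space)  # one action space per agent as well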
tests/test_deep/test_common/test_utils.py | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index 02310a30..009d66fc 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -64,13 +64,12 @@ def mlp( # If at all you need to concatenate states to actions after passing states through n FC layers -def mlp_( - self, +def mlp_concat( layer_sizes: Tuple, - weight_init: str, - activation_func: str, - concat_ind: int, - sac: bool, + weight_init: str = "xavier_uniform", + activation_func: str = "relu", + concat_ind: int = -1, # negative number means no concatenation + sac: bool = False, ): """ Generates an MLP model given sizes of each layer @@ -115,9 +114,9 @@ def shared_mlp( shared: Tuple, network1_post: Tuple, network2_post: Tuple, - weight_init: str, - activation_func: str, - sac: bool, + weight_init: str = "xavier_uniform", + activation_func: str = "relu", + sac: bool = False, ): """ Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) diff --git a/tests/test_deep/test_common/test_utils.py b/tests/test_deep/test_common/test_utils.py index 11bf1292..58482a90 100644 --- a/tests/test_deep/test_common/test_utils.py +++ b/tests/test_deep/test_common/test_utils.py @@ -8,7 +8,15 @@ from genrl.core import CnnValue, MlpActorCritic, MlpPolicy, MlpValue from genrl.environments import VectorEnv from genrl.trainers import OnPolicyTrainer -from genrl.utils import cnn, get_env_properties, get_model, mlp, set_seeds +from genrl.utils import ( + cnn, + get_env_properties, + get_model, + mlp, + mlp_concat, + set_seeds, + shared_mlp, +) class TestUtils: @@ -33,15 +41,23 @@ def test_mlp(self): sizes = [2, 3, 3, 2] mlp_nn = mlp(sizes) mlp_nn_sac = mlp(sizes, sac=True) + mlp_nn_concat = mlp(sizes, concat_ind=1, sac=False) + mlp_nn_concat_sac = mlp_concat(sizes, concat_ind=1, sac=True) assert len(mlp_nn) == 2 * (len(sizes) - 1) assert all(isinstance(mlp_nn[i], nn.Linear) for i in range(0, 5, 2)) + assert len(mlp_nn_concat) == 2 * (len(sizes) - 1) + assert all(isinstance(mlp_nn_concat[i], nn.Linear) for i in range(0, 5, 2)) assert len(mlp_nn_sac) == 2 * (len(sizes) - 2) assert all(isinstance(mlp_nn_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(mlp_nn_concat_sac) == 2 * (len(sizes) - 2) + assert all(isinstance(mlp_nn_concat_sac[i], nn.Linear) for i in range(0, 4, 2)) inp = torch.randn((2,)) assert mlp_nn(inp).shape == (2,) + assert mlp_nn_concat(inp).shape == (2,) assert mlp_nn_sac(inp).shape == (3,) + assert mlp_nn_concat_sac(inp).shape == (3,) def test_cnn(self): """ From 8cc732b660536f012b39afc23d722eb00e2055e7 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 13:39:42 +0530 Subject: [PATCH 19/39] adding test for mlp_concat --- tests/test_deep/test_common/test_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_deep/test_common/test_utils.py b/tests/test_deep/test_common/test_utils.py index 58482a90..4338ff61 100644 --- a/tests/test_deep/test_common/test_utils.py +++ b/tests/test_deep/test_common/test_utils.py @@ -43,6 +43,12 @@ def test_mlp(self): mlp_nn_sac = mlp(sizes, sac=True) mlp_nn_concat = mlp(sizes, concat_ind=1, sac=False) mlp_nn_concat_sac = mlp_concat(sizes, concat_ind=1, sac=True) + shared_mlp_nn1, shared_mlp_nn2 = shared_mlp( + sizes, sizes, sizes, sizes, sizes, sac=False + ) + shared_mlp_nn1_sac, shared_mlp_nn2_sac = shared_mlp( + sizes, sizes, sizes, sizes, sizes, sac=True + ) assert len(mlp_nn) == 2 * (len(sizes) - 
1) assert all(isinstance(mlp_nn[i], nn.Linear) for i in range(0, 5, 2)) @@ -52,12 +58,24 @@ def test_mlp(self): assert all(isinstance(mlp_nn_sac[i], nn.Linear) for i in range(0, 4, 2)) assert len(mlp_nn_concat_sac) == 2 * (len(sizes) - 2) assert all(isinstance(mlp_nn_concat_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(shared_mlp_nn1) == 2 * (len(sizes) - 1) * 3 + assert len(shared_mlp_nn2) == 2 * (len(sizes) - 1) * 3 + assert all(isinstance(shared_mlp_nn1[i], nn.Linear) for i in range(0, 8, 2)) + assert all(isinstance(shared_mlp_nn2[i], nn.Linear) for i in range(0, 8, 2)) + assert len(shared_mlp_nn1_sac) == 2 * (len(sizes) - 2) * 3 + assert all(isinstance(shared_mlp_nn1_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(shared_mlp_nn2_sac) == 2 * (len(sizes) - 2) * 3 + assert all(isinstance(shared_mlp_nn2_sac[i], nn.Linear) for i in range(0, 4, 2)) inp = torch.randn((2,)) assert mlp_nn(inp).shape == (2,) assert mlp_nn_concat(inp).shape == (2,) + assert shared_mlp_nn1(inp).shape == (2,) + assert shared_mlp_nn2(inp).shape == (2,) assert mlp_nn_sac(inp).shape == (3,) assert mlp_nn_concat_sac(inp).shape == (3,) + assert shared_mlp_nn1_sac(inp).shape == (3,) + assert shared_mlp_nn2_sac(inp).shape == (3,) def test_cnn(self): """ From b8f7f6acbe234f2ed29bdc18897e1e464580bbd7 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 14:27:07 +0530 Subject: [PATCH 20/39] fixing errors --- genrl/core/actor_critic.py | 12 ------------ tests/test_deep/test_common/test_utils.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index b81d5f0c..36d3f94b 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -253,12 +253,6 @@ def forward(self, state_critic, state_action): def get_action(self, state, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(None, state) - if one_hot: - if deterministic: - logits = self.onehot_from_logits(logits, eps=1.0) - else: - logits = self.onehot_from_logits(logits, eps=0.0) - return logits dist = F.softmax(logits, dim=0) probs = Categorical(dist) @@ -294,12 +288,6 @@ def forward(self, state): def get_action(self, state, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(state) - if one_hot: - if deterministic: - logits = self.onehot_from_logits(logits, eps=1.0) - else: - logits = self.onehot_from_logits(logits, eps=0.0) - return logits dist = F.softmax(logits, dim=0) probs = Categorical(dist) diff --git a/tests/test_deep/test_common/test_utils.py b/tests/test_deep/test_common/test_utils.py index 4338ff61..04f3be49 100644 --- a/tests/test_deep/test_common/test_utils.py +++ b/tests/test_deep/test_common/test_utils.py @@ -41,7 +41,7 @@ def test_mlp(self): sizes = [2, 3, 3, 2] mlp_nn = mlp(sizes) mlp_nn_sac = mlp(sizes, sac=True) - mlp_nn_concat = mlp(sizes, concat_ind=1, sac=False) + mlp_nn_concat = mlp_concat(sizes, concat_ind=1, sac=False) mlp_nn_concat_sac = mlp_concat(sizes, concat_ind=1, sac=True) shared_mlp_nn1, shared_mlp_nn2 = shared_mlp( sizes, sizes, sizes, sizes, sizes, sac=False From e50e230c0fe3f6a2633c1dea88a56e620c33d8fc Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Fri, 4 Sep 2020 14:35:47 +0530 Subject: [PATCH 21/39] adding docstring --- genrl/core/buffers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/genrl/core/buffers.py b/genrl/core/buffers.py index f87d8343..3ba9e7e1 100644 --- a/genrl/core/buffers.py +++ b/genrl/core/buffers.py 
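Note: a short sketch of the transition layout MultiAgentReplayBuffer expects on push(). Judging from the sample() implementation in the earlier patch, each entry is a (state, action, reward, next_state, done) tuple whose first four fields are indexable per agent; the two-agent sizes below are made-up illustration values, and done is stored as a tensor so it can later be stacked.

    import torch
    from genrl.core.buffers import MultiAgentReplayBuffer

    num_agents = 2
    buffer = MultiAgentReplayBuffer(num_agents=num_agents, capacity=1000)

    # One transition: per-agent observations, actions, rewards and next
    # observations, plus a done tensor shared across agents.
    state = [torch.randn(4) for _ in range(num_agents)]
    action = [torch.randn(2) for _ in range(num_agents)]
    reward = [torch.tensor(0.5) for _ in range(num_agents)]
    next_state = [torch.randn(4) for _ in range(num_agents)]
    done = torch.zeros(num_agents)

    buffer.push((state, action, reward, next_state, done))
    assert len(buffer) == 1

    # sample(batch_size) then returns both the per-agent batches and the
    # concatenated global state/action batches described in its docstring.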
@@ -196,6 +196,14 @@ class MultiAgentReplayBuffer: """ def __init__(self, num_agents: int, capacity: int): + """ + Initialising the buffer + :param num_agents: number of agents in the environment + :type num_agents: int + :param capacity: Max buffer size + :type capacity: int + + """ self.capacity = capacity self.num_agents = num_agents self.buffer = deque(maxlen=self.capacity) From 835819e193413e59aa3b09f63cd7b385ea3a359a Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Fri, 4 Sep 2020 22:55:36 +0530 Subject: [PATCH 22/39] Renaming Multi -> Two and comments --- genrl/core/actor_critic.py | 82 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index aa460be7..a6fb7e49 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -75,7 +75,7 @@ def __init__( **kwargs, ): super(MlpSharedActorCritic, self).__init__() - self.shared = mlp([state_dim] + list(shared_layers)) + self.shared_network = mlp([state_dim] + list(shared_layers)) self.actor = MlpPolicy( shared_layers[-1], action_dim, policy_layers, discrete, **kwargs ) @@ -86,8 +86,12 @@ def __init__( self.action_dim = action_dim def get_params(self): - actor_params = list(self.shared.parameters()) + list(self.actor.parameters()) - critic_params = list(self.shared.parameters()) + list(self.critic.parameters()) + actor_params = list(self.shared_network.parameters()) + list( + self.actor.parameters() + ) + critic_params = list(self.shared_network.parameters()) + list( + self.critic.parameters() + ) return actor_params, critic_params def get_features(self, state: torch.Tensor): @@ -99,7 +103,7 @@ def get_features(self, state: torch.Tensor): Returns: features (:obj:`torch.Tensor`): The feature(s) extracted from the state """ - features = self.shared(state) + features = self.shared_network(state) return features def get_action(self, state: torch.Tensor, deterministic: bool = False): @@ -116,8 +120,8 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): """ state = torch.as_tensor(state).float() - features = self.get_features(state) - action_probs = self.actor(features) + shared_features = self.get_features(state) + action_probs = self.actor(shared_features) action_probs = nn.Softmax(dim=-1)(action_probs) if deterministic: @@ -139,17 +143,28 @@ def get_value(self, state: torch.Tensor): values (:obj:`list`): List of values as estimated by the critic """ state = torch.as_tensor(state).float() + if self.critic.val_type == "Qsa": - features = self.shared(state[:, :, :-1]) - features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) - value = self.critic(features).float().squeeze(-1) + # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] + + # extract shared_features from just the state + # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] + shared_features = self.shared_network(state[:, :, : -self.action_dim]) + + # concatenate the actions to the extracted shared_features + # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim] + shared_features = torch.cat( + [shared_features, state[:, :, -self.action_dim :]], dim=-1 + ) + + value = self.critic(shared_features).float().squeeze(-1) else: - features = self.shared(state) - value = self.critic(features) + shared_features = self.shared_network(state) + value = self.critic(shared_features) return value -class MlpSingleActorMultiCritic(BaseActorCritic): +class 
MlpSingleActorTwoCritic(BaseActorCritic): """MLP Actor Critic Attributes: @@ -175,7 +190,7 @@ def __init__( num_critics: int = 2, **kwargs, ): - super(MlpSingleActorMultiCritic, self).__init__() + super(MlpSingleActorTwoCritic, self).__init__() self.num_critics = num_critics @@ -264,7 +279,7 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: return values -class MlpSharedSingleActorMultiCritic(MlpSingleActorMultiCritic): +class MlpSharedSingleActorTwoCritic(MlpSingleActorTwoCritic): """MLP Actor Critic Attributes: @@ -292,7 +307,7 @@ def __init__( num_critics: int = 2, **kwargs, ): - super(MlpSharedSingleActorMultiCritic, self).__init__( + super(MlpSharedSingleActorTwoCritic, self).__init__( shared_layers[-1], action_dim, policy_layers, @@ -302,7 +317,19 @@ def __init__( num_critics, **kwargs, ) - self.shared = mlp([state_dim] + list(shared_layers)) + self.shared_network = mlp([state_dim] + list(shared_layers)) + self.action_dim = action_dim + + def get_params(self): + actor_params = list(self.shared_network.parameters()) + list( + self.actor.parameters() + ) + critic_params = ( + list(self.shared_network.parameters()) + + list(self.critic1.parameters()) + + list(self.critic2.parameters()) + ) + return actor_params, critic_params def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value @@ -313,7 +340,7 @@ def get_features(self, state: torch.Tensor): Returns: features (:obj:`torch.Tensor`): The feature(s) extracted from the state """ - features = self.shared(state) + features = self.shared_network(state) return features def get_action(self, state: torch.Tensor, deterministic: bool = False): @@ -326,9 +353,9 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): Returns: action (:obj:`list`): List of actions as estimated by the critic distribution (): The distribution from which the action was sampled - (None if determinist + (None if deterministic) """ - return super(MlpSharedSingleActorMultiCritic, self).get_action( + return super(MlpSharedSingleActorTwoCritic, self).get_action( self.get_features(state), deterministic=deterministic ) @@ -346,9 +373,16 @@ def get_value(self, state: torch.Tensor, mode="first"): values (:obj:`list`): List of values as estimated by each individual critic """ state = torch.as_tensor(state).float() - x = self.get_features(state[:, :, :-1]) - state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) - return super(MlpSharedSingleActorMultiCritic, self).get_value(state, mode) + # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] + + # extract shard features for just the state + # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] + x = self.get_features(state[:, :, : -self.action_dim]) + + # concatenate the actions to the extracted shared features + # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim] + state = torch.cat([x, state[:, :, -self.action_dim :]], dim=-1) + return super(MlpSharedSingleActorTwoCritic, self).get_value(state, mode) class CNNActorCritic(BaseActorCritic): @@ -438,9 +472,9 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, - "mlp12": MlpSingleActorMultiCritic, + "mlp12": MlpSingleActorTwoCritic, "mlps": MlpSharedActorCritic, - "mlp12s": MlpSharedSingleActorMultiCritic, + "mlp12s": MlpSharedSingleActorTwoCritic, } From 793c0457fb0006fb8c0aa970d6b333f5e41c7a3c Mon Sep 17 
00:00:00 2001 From: Aditya Kapoor Date: Sat, 5 Sep 2020 11:22:12 +0530 Subject: [PATCH 23/39] changing names --- genrl/core/actor_critic.py | 4 ++-- tests/test_deep/test_common/test_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 36d3f94b..1e47f5e1 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -268,7 +268,7 @@ def get_value(self, state): return value -class Actor(MlpPolicy): +class MultiAgentActor(MlpPolicy): def __init__( self, state_dim: spaces.Space, @@ -298,7 +298,7 @@ def get_action(self, state, deterministic=False): return index -class Critic(MlpValue): +class MultiAgentCritic(MlpValue): def __init__( self, state_dim: spaces.Space, diff --git a/tests/test_deep/test_common/test_utils.py b/tests/test_deep/test_common/test_utils.py index 04f3be49..85dc95ab 100644 --- a/tests/test_deep/test_common/test_utils.py +++ b/tests/test_deep/test_common/test_utils.py @@ -8,7 +8,7 @@ from genrl.core import CnnValue, MlpActorCritic, MlpPolicy, MlpValue from genrl.environments import VectorEnv from genrl.trainers import OnPolicyTrainer -from genrl.utils import ( +from genrl.utils.utils import ( cnn, get_env_properties, get_model, From 2635fd521cd13c66edba36b0e85ccd59547b60df Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sat, 5 Sep 2020 11:28:35 +0530 Subject: [PATCH 24/39] changing names --- genrl/core/actor_critic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1e47f5e1..a9f241a3 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -277,7 +277,9 @@ def __init__( discrete: bool = True, **kwargs, ): - super(Actor, self).__init__(state_dim, action_dim, hidden, discrete ** kwargs) + super(MultiAgentActor, self).__init__( + state_dim, action_dim, hidden, discrete ** kwargs + ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -307,7 +309,7 @@ def __init__( val_type: str = "V", **kwargs, ): - super(Critic, self).__init__( + super(MultiAgentCritic, self).__init__( state_dim, action_dim, fc_layers, val_type, **kwargs ) From 65b6520e363aa361ef2db6be3057dad04bc29911 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sun, 6 Sep 2020 02:58:58 +0530 Subject: [PATCH 25/39] Shared params for single ACs --- .pre-commit-config.yaml | 6 +- genrl/agents/deep/a2c/a2c.py | 17 +- genrl/agents/deep/base/base.py | 4 + genrl/agents/deep/ddpg/ddpg.py | 16 +- genrl/agents/deep/ppo1/ppo1.py | 15 +- genrl/core/actor_critic.py | 364 +++++++++------------- genrl/core/buffers.py | 127 +++++++- genrl/environments/gym_wrapper.py | 104 +++++++ genrl/environments/suite.py | 35 +++ genrl/utils/utils.py | 242 ++++++++++++-- tests/test_deep/test_agents/test_a2c.py | 9 +- tests/test_deep/test_common/test_utils.py | 36 ++- 12 files changed, 697 insertions(+), 278 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c68e57a..7daa29ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: https://github.com/asottile/seed-isort-config - rev: v1.9.4 + rev: v2.2.0 hooks: - id: seed-isort-config args: [--exclude=^((examples|docs)/.*)$] - repo: https://github.com/timothycrosley/isort - rev: 4.3.2 + rev: 5.5.0 hooks: - id: isort @@ -14,7 +14,7 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3.7 + language_version: python3 - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.3 diff --git 
a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index cf5501e0..fe78ac6a 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -65,10 +65,8 @@ def _create_model(self) -> None: state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is None: arch_type = self.network - if self.shared_layers is not None: - arch_type += "s" self.ac = get_model("ac", arch_type)( state_dim, action_dim, @@ -79,7 +77,18 @@ def _create_model(self) -> None: discrete=discrete, action_lim=action_lim, ).to(self.device) - + elif isinstance(self.network, str) and self.shared_layers is not None: + arch_type = self.network + "s" + self.ac = get_model("ac", arch_type)( + state_dim, + action_dim, + critic_prev=self.critic_prev, + actor_prev=self.actor_prev, + shared_layers=self.shared_layers, + critic_post=self.value_layers, + actor_post=self.policy_layers, + val_type="V", + ).to(self.device) else: self.ac = self.network.to(self.device) diff --git a/genrl/agents/deep/base/base.py b/genrl/agents/deep/base/base.py index 29c02626..c9a25884 100644 --- a/genrl/agents/deep/base/base.py +++ b/genrl/agents/deep/base/base.py @@ -34,6 +34,8 @@ def __init__( create_model: bool = True, batch_size: int = 64, gamma: float = 0.99, + actor_prev=[], + critic_prev=[], shared_layers=None, policy_layers: Tuple = (64, 64), value_layers: Tuple = (64, 64), @@ -52,6 +54,8 @@ def __init__( self.value_layers = value_layers self.lr_policy = lr_policy self.lr_value = lr_value + self.actor_prev = actor_prev + self.critic_prev = critic_prev self.seed = kwargs["seed"] if "seed" in kwargs else None self.render = kwargs["render"] if "render" in kwargs else False diff --git a/genrl/agents/deep/ddpg/ddpg.py b/genrl/agents/deep/ddpg/ddpg.py index cfcd4e42..4ed0fe54 100644 --- a/genrl/agents/deep/ddpg/ddpg.py +++ b/genrl/agents/deep/ddpg/ddpg.py @@ -62,10 +62,8 @@ def _create_model(self) -> None: torch.zeros(action_dim), self.noise_std * torch.ones(action_dim) ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is None: arch_type = self.network - if self.shared_layers is not None: - arch_type += "s" self.ac = get_model("ac", arch_type)( state_dim, action_dim, @@ -75,6 +73,18 @@ def _create_model(self) -> None: "Qsa", False, ).to(self.device) + elif isinstance(self.network, str) and self.shared_layers is not None: + arch_type = self.network + "s" + self.ac = get_model("ac", arch_type)( + state_dim, + action_dim, + critic_prev=self.critic_prev, + actor_prev=self.actor_prev, + shared_layers=self.shared_layers, + critic_post=self.value_layers, + actor_post=self.policy_layers, + val_type="Qsa", + ).to(self.device) else: self.ac = self.network diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 1174b4b1..189bee93 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -65,10 +65,8 @@ def _create_model(self): state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is None: arch = self.network - if self.shared_layers is not None: - arch += "s" self.ac = get_model("ac", arch)( state_dim, action_dim, @@ -80,6 +78,17 @@ def _create_model(self): action_lim=action_lim, activation=self.activation, ).to(self.device) + elif isinstance(self.network, str) and self.shared_layers is not 
None: + arch_type = self.network + "s" + self.ac = get_model("ac", arch_type)( + state_dim, + action_dim, + critic_prev=self.critic_prev, + actor_prev=self.actor_prev, + shared_layers=self.shared_layers, + critic_post=self.value_layers, + actor_post=self.policy_layers, + ).to(self.device) else: self.ac = self.network.to(self.device) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1a96e959..c3282996 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -2,13 +2,14 @@ import torch # noqa import torch.nn as nn # noqa +import torch.nn.functional as F from gym import spaces from torch.distributions import Categorical, Normal from genrl.core.base import BaseActorCritic from genrl.core.policies import MlpPolicy from genrl.core.values import MlpValue -from genrl.utils.utils import cnn, mlp +from genrl.utils.utils import cnn, mlp, shared_mlp class MlpActorCritic(BaseActorCritic): @@ -47,122 +48,6 @@ def get_params(self): return actor_params, critic_params -class MlpSharedActorCritic(BaseActorCritic): - """MLP Shared Actor Critic - - Attributes: - state_dim (int): State dimensions of the environment - action_dim (int): Action space dimensions of the environment - shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP - policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP - value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP - val_type (str): Value type of the critic network - discrete (bool): True if the action space is discrete, else False - sac (bool): True if a SAC-like network is needed, else False - activation (str): Activation function to be used. Can be either "tanh" or "relu" - """ - - def __init__( - self, - state_dim: spaces.Space, - action_dim: spaces.Space, - shared_layers: Tuple = (32, 32), - policy_layers: Tuple = (32, 32), - value_layers: Tuple = (32, 32), - val_type: str = "V", - discrete: bool = True, - **kwargs, - ): - super(MlpSharedActorCritic, self).__init__() - self.shared_network = mlp([state_dim] + list(shared_layers)) - self.actor = MlpPolicy( - shared_layers[-1], action_dim, policy_layers, discrete, **kwargs - ) - self.critic = MlpValue( - shared_layers[-1], action_dim, val_type, value_layers, **kwargs - ) - self.state_dim = state_dim - self.action_dim = action_dim - - def get_params(self): - actor_params = list(self.shared_network.parameters()) + list( - self.actor.parameters() - ) - critic_params = list(self.shared_network.parameters()) + list( - self.critic.parameters() - ) - return actor_params, critic_params - - def get_features(self, state: torch.Tensor): - """Extract features from the state, which is then an input to get_action and get_value - - Args: - state (:obj:`torch.Tensor`): The state(s) being passed - - Returns: - features (:obj:`torch.Tensor`): The feature(s) extracted from the state - """ - features = self.shared_network(state) - return features - - def get_action(self, state: torch.Tensor, deterministic: bool = False): - """Get Actions from the actor - - Arg: - state (:obj:`torch.Tensor`): The state(s) being passed to the critics - deterministic (bool): True if the action space is deterministic, else False - - Returns: - action (:obj:`list`): List of actions as estimated by the critic - distribution (): The distribution from which the action was sampled - (None if determinist - """ - - state = torch.as_tensor(state).float() - shared_features = self.get_features(state) - action_probs = self.actor(shared_features) - action_probs = 
nn.Softmax(dim=-1)(action_probs) - - if deterministic: - action = torch.argmax(action_probs, dim=-1).unsqueeze(-1).float() - distribution = None - else: - distribution = Categorical(probs=action_probs) - action = distribution.sample() - - return action, distribution - - def get_value(self, state: torch.Tensor): - """Get Values from the Critic - - Arg: - state (:obj:`torch.Tensor`): The state(s) being passed to the critics - - Returns: - values (:obj:`list`): List of values as estimated by the critic - """ - state = torch.as_tensor(state).float() - - if self.critic.val_type == "Qsa": - # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] - - # extract shared_features from just the state - # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] - shared_features = self.shared_network(state[:, :, : -self.action_dim]) - - # concatenate the actions to the extracted shared_features - # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim] - shared_features = torch.cat( - [shared_features, state[:, :, -self.action_dim :]], dim=-1 - ) - - value = self.critic(shared_features).float().squeeze(-1) - else: - shared_features = self.shared_network(state) - value = self.critic(shared_features) - return value - - class MlpSingleActorTwoCritic(BaseActorCritic): """MLP Actor Critic @@ -279,112 +164,6 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: return values -class MlpSharedSingleActorTwoCritic(MlpSingleActorTwoCritic): - """MLP Actor Critic - - Attributes: - state_dim (int): State dimensions of the environment - action_dim (int): Action space dimensions of the environment - shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP - policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP - value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP - val_type (str): Value type of the critic network - discrete (bool): True if the action space is discrete, else False - num_critics (int): Number of critics in the architecture - sac (bool): True if a SAC-like network is needed, else False - activation (str): Activation function to be used. 
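At the agent level the shared architecture is selected simply by passing shared_layers, as the updated tests in this patch series do. A hedged usage sketch (import paths assumed from the package layout):

from genrl.agents import A2C
from genrl.environments import VectorEnv
from genrl.trainers import OnPolicyTrainer

# Passing shared_layers switches the agent onto the shared ("mlps") network.
env = VectorEnv("CartPole-v0", 1)
algo = A2C(
    "mlp",
    env,
    policy_layers=(32, 32),
    value_layers=(32, 32),
    shared_layers=(32, 32),
    rollout_size=128,
)
trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1)
trainer.train()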
Can be either "tanh" or "relu" - """ - - def __init__( - self, - state_dim: spaces.Space, - action_dim: spaces.Space, - shared_layers: Tuple = (32, 32), - policy_layers: Tuple = (32, 32), - value_layers: Tuple = (32, 32), - val_type: str = "Qsa", - discrete: bool = True, - num_critics: int = 2, - **kwargs, - ): - super(MlpSharedSingleActorTwoCritic, self).__init__( - shared_layers[-1], - action_dim, - policy_layers, - value_layers, - val_type, - discrete, - num_critics, - **kwargs, - ) - self.shared_network = mlp([state_dim] + list(shared_layers)) - self.action_dim = action_dim - - def get_params(self): - actor_params = list(self.shared_network.parameters()) + list( - self.actor.parameters() - ) - critic_params = ( - list(self.shared_network.parameters()) - + list(self.critic1.parameters()) - + list(self.critic2.parameters()) - ) - return actor_params, critic_params - - def get_features(self, state: torch.Tensor): - """Extract features from the state, which is then an input to get_action and get_value - - Args: - state (:obj:`torch.Tensor`): The state(s) being passed - - Returns: - features (:obj:`torch.Tensor`): The feature(s) extracted from the state - """ - features = self.shared_network(state) - return features - - def get_action(self, state: torch.Tensor, deterministic: bool = False): - """Get Actions from the actor - - Arg: - state (:obj:`torch.Tensor`): The state(s) being passed to the critics - deterministic (bool): True if the action space is deterministic, else False - - Returns: - action (:obj:`list`): List of actions as estimated by the critic - distribution (): The distribution from which the action was sampled - (None if deterministic) - """ - return super(MlpSharedSingleActorTwoCritic, self).get_action( - self.get_features(state), deterministic=deterministic - ) - - def get_value(self, state: torch.Tensor, mode="first"): - """Get Values from both the Critic - - Arg: - state (:obj:`torch.Tensor`): The state(s) being passed to the critics - mode (str): What values should be returned. 
Types: - "both" --> Both values will be returned - "min" --> The minimum of both values will be returned - "first" --> The value from the first critic only will be returned - - Returns: - values (:obj:`list`): List of values as estimated by each individual critic - """ - state = torch.as_tensor(state).float() - # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] - - # extract shard features for just the state - # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] - x = self.get_features(state[:, :, : -self.action_dim]) - - # concatenate the actions to the extracted shared features - # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim] - state = torch.cat([x, state[:, :, -self.action_dim :]], dim=-1) - return super(MlpSharedSingleActorTwoCritic, self).get_value(state, mode) - - class CNNActorCritic(BaseActorCritic): """ CNN Actor Critic @@ -469,12 +248,147 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: return value +class SharedActorCritic(BaseActorCritic): + def __init__( + self, + state_dim, + action_dim, + shared_layers, + critic_post, + actor_post, + val_type="V", + weight_init="xavier_uniform", + activation_func="relu", + critic_prev=[], + actor_prev=[], + ): + super(SharedActorCritic, self).__init__() + if len(actor_prev) > 0 and len(critic_prev) > 0: + actor_prev = [state_dim] + list(actor_prev) + if val_type == "Qsa": + critic_prev = [state_dim + action_dim] + list(critic_prev) + else: + critic_prev = [state_dim] + critic_prev + else: + shared_layers = [state_dim] + list(shared_layers) + + if val_type == "V" or val_type == "Qsa": + critic_post = list(critic_post) + [1] + elif val_type == "Qs": + critic_post = list(critic_post) + [action_dim] + else: + raise NotImplementedError + + actor_post = list(actor_post) + [action_dim] + self.critic, self.actor = shared_mlp( + critic_prev, + actor_prev, + shared_layers, + critic_post, + actor_post, + weight_init, + activation_func, + False, + ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(self.actor, self.critic) + + def get_params(self): + actor_params = self.actor.parameters() + critic_params = self.critic.parameters() + return actor_params, critic_params + + def forward(self, state_critic, state_action): + + if state_critic is not None: + return self.critic(state_critic) + + if state_action is not None: + return self.actor(state_action) + + def get_action(self, state, deterministic=False): + # state = torch.FloatTensor(state).to(self.device) + logits = self.forward(None, state) + + probs = nn.Softmax(dim=-1)(logits) + dist = Categorical(probs) + if deterministic: + index = torch.argmax(probs, dim=-1).unsqueeze(-1).float() + else: + index = dist.sample() + print(index.shape) + return index, dist + + def get_value(self, state): + # state = torch.FloatTensor(state).to(self.device) + value = self.forward(state, None) + return value + + +class MultiAgentActor(MlpPolicy): + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + hidden: Tuple = (32, 32), + discrete: bool = True, + **kwargs, + ): + super(MultiAgentActor, self).__init__( + state_dim, action_dim, hidden, discrete ** kwargs + ) + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def forward(self, state): + state = self.model(state) + return state + + def get_action(self, state, deterministic=False): + # state = torch.FloatTensor(state).to(self.device) + logits = self.forward(state) + + dist = F.softmax(logits, 
dim=0) + probs = Categorical(dist) + if deterministic: + index = torch.argmax(probs) + else: + index = probs.sample().cpu().detach().item() + return index + + +class MultiAgentCritic(MlpValue): + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + fc_layers: Tuple = (32, 32), + val_type: str = "V", + **kwargs, + ): + super(MultiAgentCritic, self).__init__( + state_dim, action_dim, fc_layers, val_type, **kwargs + ) + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def forward(self, state): + + state = self.model(state) + + return state + + def get_value(self, state): + # state = torch.FloatTensor(state).to(self.device) + value = self.forward(state) + return value + + actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, "mlp12": MlpSingleActorTwoCritic, - "mlps": MlpSharedActorCritic, - "mlp12s": MlpSharedSingleActorTwoCritic, + "mlps": SharedActorCritic, } diff --git a/genrl/core/buffers.py b/genrl/core/buffers.py index 0a5b6e7c..3ba9e7e1 100644 --- a/genrl/core/buffers.py +++ b/genrl/core/buffers.py @@ -117,15 +117,15 @@ def sample( ] ): """ - (Returns randomly sampled memories from replay memory along with their + (Returns randomly sampled memories from replay memory along with their respective indices and weights) - :param batch_size: Number of samples per batch - :param beta: (Bias exponent used to correct + :param batch_size: Number of samples per batch + :param beta: (Bias exponent used to correct Importance Sampling (IS) weights) - :type batch_size: int - :type beta: float - :returns: (Tuple containing `states`, `actions`, `next_states`, + :type batch_size: int + :type beta: float + :returns: (Tuple containing `states`, `actions`, `next_states`, `rewards`, `dones`, `indices` and `weights`) """ if beta is None: @@ -181,3 +181,118 @@ def __len__(self) -> int: @property def pos(self): return len(self.buffer) + + +class MultiAgentReplayBuffer: + """ + Implements the basic Experience Replay Mechanism for MultiAgents + by feeding in global states, global actions, global rewards, + global next_states, global dones + + :param capacity: Size of the replay buffer + :type capacity: int + :param num_agents: Number of agents in the environment + :type num_agents: int + """ + + def __init__(self, num_agents: int, capacity: int): + """ + Initialising the buffer + :param num_agents: number of agents in the environment + :type num_agents: int + :param capacity: Max buffer size + :type capacity: int + + """ + self.capacity = capacity + self.num_agents = num_agents + self.buffer = deque(maxlen=self.capacity) + + def push(self, inp: Tuple) -> None: + """ + Adds new experience to buffer + + :param inp: (Tuple containing `state`, `action`, `reward`, + `next_state` and `done`) + :type inp: tuple + :returns: None + """ + self.buffer.append(inp) + + def sample(self, batch_size): + + """ + Returns randomly sampled experiences from replay memory + + :param batch_size: Number of samples per batch + :type batch_size: int + :returns: (Tuple composing of `indiv_obs_batch`, + `indiv_action_batch`, `indiv_reward_batch`, `indiv_next_obs_batch`, + `global_state_batch`, `global_actions_batch`, `global_next_state_batch`, + `done_batch`) + """ + indiv_obs_batch = [ + [] for _ in range(self.num_agents) + ] # [ [states of agent 1], ... ,[states of agent n] ] ] + indiv_action_batch = [ + [] for _ in range(self.num_agents) + ] # [ [actions of agent 1], ... 
, [actions of agent n]] + indiv_reward_batch = [[] for _ in range(self.num_agents)] + indiv_next_obs_batch = [[] for _ in range(self.num_agents)] + + global_state_batch = [] + global_next_state_batch = [] + global_actions_batch = [] + done_batch = [] + + batch = random.sample(self.buffer, batch_size) + + for experience in batch: + state, action, reward, next_state, done = experience + + for i in range(self.num_agents): + indiv_obs_batch[i].append(state[i]) + indiv_action_batch[i].append(action[i]) + indiv_reward_batch[i].append(reward[i]) + indiv_next_obs_batch[i].append(next_state[i]) + + global_state_batch.append(torch.cat(state)) + global_actions_batch.append(torch.cat(action)) + global_next_state_batch.append(torch.cat(next_state)) + done_batch.append(done) + + global_state_batch = torch.stack(global_state_batch) + global_actions_batch = torch.stack(global_actions_batch) + global_next_state_batch = torch.stack(global_next_state_batch) + done_batch = torch.stack(done_batch) + indiv_obs_batch = torch.stack( + [torch.FloatTensor(obs) for obs in indiv_obs_batch] + ) + indiv_action_batch = torch.stack( + [torch.FloatTensor(act) for act in indiv_action_batch] + ) + indiv_reward_batch = torch.stack( + [torch.FloatTensor(rew) for rew in indiv_reward_batch] + ) + indiv_next_obs_batch = torch.stack( + [torch.FloatTensor(next_obs) for next_obs in indiv_next_obs_batch] + ) + + return ( + indiv_obs_batch, + indiv_action_batch, + indiv_reward_batch, + indiv_next_obs_batch, + global_state_batch, + global_actions_batch, + global_next_state_batch, + done_batch, + ) + + def __len__(self): + """ + Gives number of experiences in buffer currently + + :returns: Length of replay memory + """ + return len(self.buffer) diff --git a/genrl/environments/gym_wrapper.py b/genrl/environments/gym_wrapper.py index ceb99472..cbbc96e4 100644 --- a/genrl/environments/gym_wrapper.py +++ b/genrl/environments/gym_wrapper.py @@ -106,3 +106,107 @@ def close(self) -> None: Closes environment """ self.env.close() + + +class MultiGymWrapper(gym.Wrapper): + """ + Wrapper class for all MultiAgent Particle Environments + + :param env: Gym environment name + :param n_envs: Number of environments. 
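The sampling logic above keeps two views of the same batch; a short sketch (made-up shapes, not the buffer class itself) of the per-agent versus global batching idea used by MultiAgentReplayBuffer.sample():

import torch

num_agents, obs_dim, batch = 2, 4, 3
# One list of per-agent observations per sampled experience.
states = [[torch.randn(obs_dim) for _ in range(num_agents)] for _ in range(batch)]

# "Individual" view: one (batch, obs_dim) tensor per agent, for per-agent actors.
indiv_obs = [torch.stack([s[i] for s in states]) for i in range(num_agents)]

# "Global" view: all agents concatenated, (batch, num_agents * obs_dim), for a centralised critic.
global_obs = torch.stack([torch.cat(s) for s in states])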
None if not vectorised + :param parallel: If vectorised, should environments be run through \ +serially or parallelly + :type env: string + :type n_envs: None, int + :type parallel: boolean + """ + + def __init__(self, env: gym.Env): + super(MultiGymWrapper, self).__init__(env) + self.env = env + + self.observation_space = self.env.observation_space + self.action_space = self.env.action_space + + self.state = None + self.action = None + self.reward = None + self.done = False + self.info = {} + + def __getattr__(self, name: str) -> Any: + """ + All other calls would go to base env + """ + env = super(MultiGymWrapper, self).__getattribute__("env") + return getattr(env, name) + + @property + def obs_shape(self): + if isinstance(self.env.observation_space, gym.spaces.Discrete): + obs_shape = (1,) + elif isinstance(self.env.observation_space, gym.spaces.Box): + obs_shape = self.env.observation_space.shape + return obs_shape + + @property + def action_shape(self): + if isinstance(self.env.action_space, gym.spaces.Box): + action_shape = self.env.action_space.shape + elif isinstance(self.env.action_space, gym.spaces.Discrete): + action_shape = (1,) + return action_shape + + def sample(self) -> np.ndarray: + """ + Shortcut method to directly sample from environment's action space + + :returns: Random action from action space + :rtype: NumPy Array + """ + return self.env.action_space.sample() + + def render(self, mode: str = "human") -> None: + """ + Renders all envs in a tiles format similar to baselines. + + :param mode: Can either be 'human' or 'rgb_array'. \ +Displays tiled images in 'human' and returns tiled images in 'rgb_array' + :type mode: string + """ + self.env.render(mode=mode) + + def seed(self, seed: int = None) -> None: + """ + Set environment seed + + :param seed: Value of seed + :type seed: int + """ + self.env.seed(seed) + + def step(self, action: np.ndarray) -> np.ndarray: + """ + Steps the env through given action + + :param action: Action taken by agent + :type action: NumPy array + :returns: Next observation, reward, game status and debugging info + """ + self.state, self.reward, self.done, self.info = self.env.step(action) + self.action = action + return self.state, self.reward, self.done, self.info + + def reset(self) -> np.ndarray: + """ + Resets environment + + :returns: Initial state + """ + return self.env.reset() + + def close(self) -> None: + """ + Closes environment + """ + self.env.close() diff --git a/genrl/environments/suite.py b/genrl/environments/suite.py index 11310c08..3e69ca40 100644 --- a/genrl/environments/suite.py +++ b/genrl/environments/suite.py @@ -97,3 +97,38 @@ def AtariEnv( env = wrapper(env) return env + + +def MultiAgentParticleEnv(scenario_name: str, benchmark: bool) -> gym.Env: + """ + Function to apply wrappers for all Atari envs by Trainer class + + :param scenarion_name: Environment Name + :type env: string + :param benchmark: laod benchmark results + :type wrapper_list: bool + :returns: Gym Atari Environment + :rtype: object + """ + import multiagent.scenarios as scenarios + from multiagent.environment import MultiAgentEnv + + # load scenario from script + scenario = scenarios.load(scenario_name + ".py").Scenario() + # create world + world = scenario.make_world() + # create multiagent environment + if benchmark: + env = MultiAgentEnv( + world, + scenario.reset_world, + scenario.reward, + scenario.observation, + scenario.benchmark_data, + ) + else: + env = MultiAgentEnv( + world, scenario.reset_world, scenario.reward, scenario.observation + ) + 
+ return env diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index 89e53337..ced06e31 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -13,13 +13,13 @@ def get_model(type_: str, name_: str) -> Union: """ - Utility to get the class of required function + Utility to get the class of required function - :param type_: "ac" for Actor Critic, "v" for Value, "p" for Policy - :param name_: Name of the specific structure of model. ( + :param type_: "ac" for Actor Critic, "v" for Value, "p" for Policy + :param name_: Name of the specific structure of model. ( Eg. "mlp" or "cnn") - :type type_: string - :returns: Required class. Eg. MlpActorCritic + :type type_: string + :returns: Required class. Eg. MlpActorCritic """ if type_ == "ac": from genrl.core import get_actor_critic_from_name @@ -42,13 +42,13 @@ def mlp( sac: bool = False, ): """ - Generates an MLP model given sizes of each layer + Generates an MLP model given sizes of each layer - :param sizes: Sizes of hidden layers - :param sac: True if Soft Actor Critic is being used, else False - :type sizes: tuple or list - :type sac: bool - :returns: (Neural Network with fully-connected linear layers and + :param sizes: Sizes of hidden layers + :param sac: True if Soft Actor Critic is being used, else False + :type sizes: tuple or list + :type sac: bool + :returns: (Neural Network with fully-connected linear layers and activation layers) """ layers = [] @@ -63,6 +63,166 @@ def mlp( return nn.Sequential(*layers) +# If at all you need to concatenate states to actions after passing states through n FC layers +def mlp_concat( + layer_sizes: Tuple, + weight_init: str = "xavier_uniform", + activation_func: str = "relu", + concat_ind: int = -1, # negative number means no concatenation + sac: bool = False, +): + """ + Generates an MLP model given sizes of each layer + + :param layer_sizes: Sizes of hidden layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param concat_ind: index of layer at which actions to be concatenated + :param sac: True if Soft Actor Critic is being used, else False + :type layer_sizes: tuple or list + :type concat_ind: int + :type sac: bool + :type weight_init,activation_func: string + :returns: (Neural Network with fully-connected linear layers and + activation layers) + """ + layers = [] + limit = len(layer_sizes) if sac is False else len(layer_sizes) - 1 + + # add more activations + activation = nn.Tanh() if activation_func == "tanh" else nn.ReLU() + + # add more weight init + if weight_init == "xavier_uniform": + weight_init = torch.nn.init.xavier_uniform_ + elif weight_init == "xavier_normal": + weight_init = torch.nn.init.xavier_normal_ + + for layer in range(limit - 1): + if layer == concat_ind: + continue + act = activation if layer < limit - 2 else nn.Identity() + layers += [nn.Linear(layer_sizes[layer], layer_sizes[layer + 1])] + weight_init(layers[-1][0].weight) + layers += [act] + + return nn.Sequential(*layers) + + +def shared_mlp( + network1_prev: Tuple, + network2_prev: Tuple, + shared_layers: Tuple, + network1_post: Tuple, + network2_post: Tuple, + weight_init: str = "xavier_uniform", + activation_func: str = "relu", + sac: bool = False, +): + """ + Generates an MLP model given sizes of each layer (Mostly used for SharedActorCritic) + + :param network1_prev: Sizes of network1's initial layers + :param network2_prev: Sizes of network2's initial layers + :param shared: Sizes of shared layers + :param network1_post: Sizes of network1's 
latter layers + :param network2_post: Sizes of network2's latter layers + :param weight_init: type of weight initialization + :param activation_func: type of activation function + :param sac: True if Soft Actor Critic is being used, else False + :type network1_prev,network2_prev,shared,network1_post,network2_post: tuple or list + :type weight_init,activation_func: string + :type sac: bool + :returns: network1 and networ2(Neural Network with fully-connected linear layers and + activation layers) + """ + + if len(network1_prev) != 0: + net1_prev = nn.ModuleList() + if len(network2_prev) != 0: + net2_prev = nn.ModuleList() + if len(shared_layers) != 0: + shared = nn.ModuleList() + if len(network1_post) != 0: + net1_post = nn.ModuleList() + if len(network2_post) != 0: + net2_post = nn.ModuleList() + + # add more weight init + if weight_init == "xavier_uniform": + weight_init = torch.nn.init.xavier_uniform_ + elif weight_init == "xavier_normal": + weight_init = torch.nn.init.xavier_normal_ + else: + weight_init = None + + if activation_func == "relu": + activation = nn.ReLU() + elif activation_func == "tanh": + activation = nn.Tanh() + else: + activation = None + + if len(shared_layers) != 0 or len(network1_post) != 0 or len(network2_post) != 0: + if len(network1_prev) != 0 and len(network2_prev) != 0: + if not ( + network1_prev[-1] == network2_prev[-1] + and network1_prev[-1] == shared_layers[0] + and network1_post[0] == network2_post[0] + and network1_post[0] == shared_layers[-1] + ): + raise ValueError + else: + if not ( + network1_post[0] == network2_post[0] + and network1_post[0] == shared_layers[-1] + ): + raise ValueError + + for i in range(len(network1_prev) - 1): + net1_prev.append(nn.Linear(network1_prev[i], network1_prev[i + 1])) + if weight_init is not None: + weight_init(net1_prev[-1].weight) + if activation is not None: + net1_prev.append(activation) + + for i in range(len(network2_prev) - 1): + net2_prev.append(nn.Linear(network2_prev[i], network2_prev[i + 1])) + if weight_init is not None: + weight_init(net2_prev[-1].weight) + if activation is not None: + net2_prev.append(activation) + + for i in range(len(shared_layers) - 1): + shared.append(nn.Linear(shared_layers[i], shared_layers[i + 1])) + if weight_init is not None: + weight_init(shared[-1].weight) + if activation is not None: + shared.append(activation) + + for i in range(len(network1_post) - 1): + net1_post.append(nn.Linear(network1_post[i], network1_post[i + 1])) + if weight_init is not None: + weight_init(net1_post[-1].weight) + if activation is not None: + net1_post.append(activation) + + for i in range(len(network2_post) - 1): + net2_post.append(nn.Linear(network2_post[i], network2_post[i + 1])) + if weight_init is not None: + weight_init(net2_post[-1].weight) + if activation is not None: + net2_post.append(activation) + + if len(network1_prev) != 0 and len(network2_prev) != 0: + network1 = nn.Sequential(*net1_prev, *shared, *net1_post) + network2 = nn.Sequential(*net2_prev, *shared, *net2_post) + else: + network1 = nn.Sequential(*shared, *net1_post) + network2 = nn.Sequential(*shared, *net2_post) + return network1, network2 + + def cnn( channels: Tuple = (4, 16, 32), kernel_sizes: Tuple = (8, 4), @@ -70,18 +230,18 @@ def cnn( **kwargs, ) -> (Tuple): """ - (Generates a CNN model given input dimensions, channels, kernel_sizes and + (Generates a CNN model given input dimensions, channels, kernel_sizes and strides) - :param channels: Input output channels before and after each convolution - :param kernel_sizes: Kernel 
sizes for each convolution - :param strides: Strides for each convolution - :param in_size: Input dimensions (assuming square input) - :type channels: tuple - :type kernel_sizes: tuple - :type strides: tuple - :type in_size: int - :returns: (Convolutional Neural Network with convolutional layers and + :param channels: Input output channels before and after each convolution + :param kernel_sizes: Kernel sizes for each convolution + :param strides: Strides for each convolution + :param in_size: Input dimensions (assuming square input) + :type channels: tuple + :type kernel_sizes: tuple + :type strides: tuple + :type in_size: int + :returns: (Convolutional Neural Network with convolutional layers and activation layers) """ @@ -107,12 +267,12 @@ def noisy_mlp(fc_layers: List[int], noisy_layers: List[int], activation="relu"): """Noisy MLP generating helper function Args: - fc_layers (:obj:`list` of :obj:`int`): List of fully connected layers - noisy_layers (:obj:`list` of :obj:`int`): :ist of noisy layers - activation (str): Activation function to be used. ["tanh", "relu"] + fc_layers (:obj:`list` of :obj:`int`): List of fully connected layers + noisy_layers (:obj:`list` of :obj:`int`): :ist of noisy layers + activation (str): Activation function to be used. ["tanh", "relu"] Returns: - Noisy MLP model + Noisy MLP model """ model = [] act = nn.Tanh if activation == "tanh" else nn.ReLU() @@ -134,15 +294,15 @@ def get_env_properties( env: Union[gym.Env, VecEnv], network: Union[str, Any] = "mlp" ) -> (Tuple[int]): """ - Finds important properties of environment + Finds important properties of environment - :param env: Environment that the agent is interacting with - :type env: Gym Environment - :param network: Type of network architecture, eg. "mlp", "cnn" - :type network: str - :returns: (State space dimensions, Action space dimensions, + :param env: Environment that the agent is interacting with + :type env: Gym Environment + :param network: Type of network architecture, eg. "mlp", "cnn" + :type network: str + :returns: (State space dimensions, Action space dimensions, discreteness of action space and action limit (highest action value) - :rtype: int, float, ...; int, float, ...; bool; int, float, ... + :rtype: int, float, ...; int, float, ...; bool; int, float, ... 
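A hedged check mirroring the updated test_utils test below: shared_mlp builds the middle block once and unpacks the same modules into both returned nn.Sequential networks, so those layers' parameter tensors are literally shared.

import torch
from genrl.utils.utils import shared_mlp

sizes = [2, 3, 3, 2]
net1, net2 = shared_mlp(sizes, sizes, sizes, sizes, sizes, sac=False)

# The shared layers' parameter objects appear in both networks.
shared_params = {id(p) for p in net1.parameters()} & {id(p) for p in net2.parameters()}
assert len(shared_params) > 0

out = net1(torch.randn(2))  # same input/output sizes exercised by the test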
""" if network == "cnn": state_dim = env.framestack @@ -199,3 +359,21 @@ def safe_mean(log: Union[torch.Tensor, List[int]]): else: func = np.mean return func(log) + + +def onehot_from_logits(self, logits, eps=0.0): + # get best (according to current policy) actions in one-hot form + argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float() + if eps == 0.0: + return argmax_acs + # get random actions in one-hot form + rand_acs = torch.eye(logits.shape[1])[ + [np.random.choice(range(logits.shape[1]), size=logits.shape[0])] + ] + # chooses between best and random actions using epsilon greedy + return torch.stack( + [ + argmax_acs[i] if r > eps else rand_acs[i] + for i, r in enumerate(torch.rand(logits.shape[0])) + ] + ) diff --git a/tests/test_deep/test_agents/test_a2c.py b/tests/test_deep/test_agents/test_a2c.py index f731f40f..ed2a6904 100644 --- a/tests/test_deep/test_agents/test_a2c.py +++ b/tests/test_deep/test_agents/test_a2c.py @@ -23,7 +23,14 @@ def test_a2c_cnn(): def test_a2c_shared(): env = VectorEnv("CartPole-v0", 1) - algo = A2C("mlp", env, shared_layers=(32, 32), rollout_size=128) + algo = A2C( + "mlp", + env, + policy_layers=(32, 32), + value_layers=(32, 32), + shared_layers=(32, 32), + rollout_size=128, + ) trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") diff --git a/tests/test_deep/test_common/test_utils.py b/tests/test_deep/test_common/test_utils.py index 11bf1292..85dc95ab 100644 --- a/tests/test_deep/test_common/test_utils.py +++ b/tests/test_deep/test_common/test_utils.py @@ -8,7 +8,15 @@ from genrl.core import CnnValue, MlpActorCritic, MlpPolicy, MlpValue from genrl.environments import VectorEnv from genrl.trainers import OnPolicyTrainer -from genrl.utils import cnn, get_env_properties, get_model, mlp, set_seeds +from genrl.utils.utils import ( + cnn, + get_env_properties, + get_model, + mlp, + mlp_concat, + set_seeds, + shared_mlp, +) class TestUtils: @@ -33,15 +41,41 @@ def test_mlp(self): sizes = [2, 3, 3, 2] mlp_nn = mlp(sizes) mlp_nn_sac = mlp(sizes, sac=True) + mlp_nn_concat = mlp_concat(sizes, concat_ind=1, sac=False) + mlp_nn_concat_sac = mlp_concat(sizes, concat_ind=1, sac=True) + shared_mlp_nn1, shared_mlp_nn2 = shared_mlp( + sizes, sizes, sizes, sizes, sizes, sac=False + ) + shared_mlp_nn1_sac, shared_mlp_nn2_sac = shared_mlp( + sizes, sizes, sizes, sizes, sizes, sac=True + ) assert len(mlp_nn) == 2 * (len(sizes) - 1) assert all(isinstance(mlp_nn[i], nn.Linear) for i in range(0, 5, 2)) + assert len(mlp_nn_concat) == 2 * (len(sizes) - 1) + assert all(isinstance(mlp_nn_concat[i], nn.Linear) for i in range(0, 5, 2)) assert len(mlp_nn_sac) == 2 * (len(sizes) - 2) assert all(isinstance(mlp_nn_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(mlp_nn_concat_sac) == 2 * (len(sizes) - 2) + assert all(isinstance(mlp_nn_concat_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(shared_mlp_nn1) == 2 * (len(sizes) - 1) * 3 + assert len(shared_mlp_nn2) == 2 * (len(sizes) - 1) * 3 + assert all(isinstance(shared_mlp_nn1[i], nn.Linear) for i in range(0, 8, 2)) + assert all(isinstance(shared_mlp_nn2[i], nn.Linear) for i in range(0, 8, 2)) + assert len(shared_mlp_nn1_sac) == 2 * (len(sizes) - 2) * 3 + assert all(isinstance(shared_mlp_nn1_sac[i], nn.Linear) for i in range(0, 4, 2)) + assert len(shared_mlp_nn2_sac) == 2 * (len(sizes) - 2) * 3 + assert all(isinstance(shared_mlp_nn2_sac[i], nn.Linear) for i in range(0, 4, 2)) inp = torch.randn((2,)) assert mlp_nn(inp).shape == (2,) + assert 
mlp_nn_concat(inp).shape == (2,) + assert shared_mlp_nn1(inp).shape == (2,) + assert shared_mlp_nn2(inp).shape == (2,) assert mlp_nn_sac(inp).shape == (3,) + assert mlp_nn_concat_sac(inp).shape == (3,) + assert shared_mlp_nn1_sac(inp).shape == (3,) + assert shared_mlp_nn2_sac(inp).shape == (3,) def test_cnn(self): """ From 2be8df5e9d5a22ff28b79e3662f018623abc7895 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 11:39:28 +0530 Subject: [PATCH 26/39] rollout buffer for MA --- genrl/core/rollout_storage.py | 206 ++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index faf5e982..7feaf9ec 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -310,3 +310,209 @@ def _get_samples(self, batch_inds: np.ndarray) -> RolloutBufferSamples: self.returns[batch_inds].flatten(), ) return RolloutBufferSamples(*tuple(map(self.to_torch, data))) + + +class MultiAgentRolloutBuffer(BaseBuffer): + """ + Rollout buffer used in on-policy algorithms like MAA2C/MAA3C. + :param num_agents: (int) Max number of agents in the environment + :param buffer_size: (int) Max number of element in the buffer + :param env: (Environment) The environment being trained on + :param device: (torch.device) + :param gae_lambda: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator + Equivalent to classic advantage when set to 1. + :param gamma: (float) Discount factor + :param n_envs: (int) Number of parallel environments + """ + + def __init__( + self, + num_agents: int, + buffer_size: int, + env, + device: Union[torch.device, str] = "cpu", + gae_lambda: float = 1, + gamma: float = 0.99, + ): + super(MultiAgentRolloutBuffer, self).__init__(buffer_size, env, device) + """ + Initialising the buffer + :param num_agents: number of agents in the environment + :type num_agents: int + :param capacity: Max buffer size + :type capacity: int + + """ + self.buffer_size = buffer_size + self.num_agents = num_agents + self.env = env + self.device = device + self.gae_lambda = gae_lambda + self.gamma = gamma + + self.observations, self.actions, self.rewards, self.advantages = ( + [None] * self.num_agents, + [None] * self.num_agents, + [None] * self.num_agents, + [None] * self.num_agents, + ) + self.returns, self.dones, self.values, self.log_probs = ( + [None] * self.num_agents, + [None] * self.num_agents, + [None] * self.num_agents, + [None] * self.num_agents, + ) + self.generator_ready = False + self.reset() + + def reset(self) -> None: + self.observations = torch.zeros( + *(self.buffer_size, self.env.n_envs, self.num_agents, *self.env.obs_shape) + ) + self.actions = torch.zeros( + *( + self.buffer_size, + self.env.n_envs, + self.num_agents, + *self.env.action_shape, + ) + ) + self.rewards = torch.zeros(self.buffer_size, self.env.n_envs, self.num_agents) + self.returns = torch.zeros(self.buffer_size, self.env.n_envs, self.num_agents) + self.dones = torch.zeros(self.buffer_size, self.env.n_envs, self.num_agents) + self.values = torch.zeros(self.buffer_size, self.env.n_envs, self.num_agents) + self.log_probs = torch.zeros(self.buffer_size, self.env.n_envs, self.num_agents) + self.advantages = torch.zeros( + self.buffer_size, self.env.n_envs, self.num_agents + ) + self.generator_ready = False + super(RolloutBuffer, self).reset() + + def add( + self, + obs: torch.zeros, + action: torch.zeros, + reward: torch.zeros, + done: torch.zeros, + value: torch.Tensor, + log_prob: torch.Tensor, + ) 
-> None: + """ + :param obs: (torch.zeros) Observation + :param action: (torch.zeros) Action + :param reward: (torch.zeros) + :param done: (torch.zeros) End of episode signal. + :param value: (torch.Tensor) estimated value of the current state + following the current policy. + :param log_prob: (torch.Tensor) log probability of the action + following the current policy. + """ + if len(log_prob.shape) == 0: + # Reshape 0-d tensor to avoid error + log_prob = log_prob.reshape(-1, 1) + + self.observations[self.pos] = obs.detach().clone() + self.actions[self.pos] = action.squeeze().detach().clone() + self.rewards[self.pos] = reward.detach().clone() + self.dones[self.pos] = done.detach().clone() + self.values[self.pos] = ( + value.detach().clone().flatten().reshape(-1, self.num_agents) + ) + self.log_probs[self.pos] = ( + log_prob.detach().clone().flatten().reshape(-1, self.num_agents) + ) + self.pos += 1 + + if self.pos == self.buffer_size: + self.full = True + + def get( + self, batch_size: Optional[int] = None + ) -> Generator[RolloutBufferSamples, None, None]: + assert self.full, "" + indices = np.random.permutation(self.buffer_size * self.env.n_envs) + # Prepare the data + if not self.generator_ready: + for tensor in [ + "observations", + "actions", + "values", + "log_probs", + "advantages", + "returns", + ]: + self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor]) + self.generator_ready = True + + # Return everything, don't create minibatches + if batch_size is None: + batch_size = self.buffer_size * self.env.n_envs + + start_idx = 0 + while start_idx < self.buffer_size * self.env.n_envs: + yield self._get_samples(indices[start_idx : start_idx + batch_size]) + start_idx += batch_size + + def _get_samples(self, batch_inds: np.ndarray) -> RolloutBufferSamples: + data = ( + self.observations[batch_inds], + self.actions[batch_inds], + self.values[batch_inds].flatten().reshape(-1, self.num_agents), + self.log_probs[batch_inds].flatten().reshape(-1, self.num_agents), + self.advantages[batch_inds].flatten().reshape(-1, self.num_agents), + self.returns[batch_inds].flatten().reshape(-1, self.num_agents), + ) + return RolloutBufferSamples(*tuple(map(self.to_torch, data))) + + def compute_returns_and_advantage( + self, last_value: torch.Tensor, dones: torch.zeros, use_gae: bool = False + ) -> None: + """ + Post-processing step: compute the returns (sum of discounted rewards) + and advantage (A(s) = R - V(S)). + Adapted from Stable-Baselines PPO2. + :param last_value: (torch.Tensor) + :param dones: (torch.zeros) + :param use_gae: (bool) Whether to use Generalized Advantage Estimation + or normal advantage for advantage computation. 
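The GAE branch below follows the usual recursion, delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t) and A_t = delta_t + gamma * lambda * (1 - done) * A_{t+1}; a standalone sketch with synthetic numbers:

import torch

gamma, gae_lambda, T = 0.99, 0.95, 5
rewards = torch.randn(T)
values = torch.randn(T)
dones = torch.zeros(T)
last_value, last_done = torch.tensor(0.5), torch.tensor(0.0)

advantages = torch.zeros(T)
last_gae_lam = 0.0
for step in reversed(range(T)):
    if step == T - 1:
        next_non_terminal = 1.0 - last_done
        next_value = last_value
    else:
        next_non_terminal = 1.0 - dones[step + 1]
        next_value = values[step + 1]
    delta = rewards[step] + gamma * next_value * next_non_terminal - values[step]
    last_gae_lam = delta + gamma * gae_lambda * next_non_terminal * last_gae_lam
    advantages[step] = last_gae_lam

# Returns are recovered by adding the value baseline back in.
returns = advantages + values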
+ """ + last_value = last_value.flatten().reshape(-1, self.num_agents) + + if use_gae: + last_gae_lam = 0 + for step in reversed(range(self.buffer_size)): + if step == self.buffer_size - 1: + next_non_terminal = 1.0 - dones + next_value = last_value + else: + next_non_terminal = 1.0 - self.dones[step + 1] + next_value = self.values[step + 1] + delta = ( + self.rewards[step] + + self.gamma * next_value * next_non_terminal + - self.values[step] + ) + last_gae_lam = ( + delta + + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam + ) + self.advantages[step] = last_gae_lam + self.returns = self.advantages + self.values + else: + # Discounted return with value bootstrap + # Note: this is equivalent to GAE computation + # with gae_lambda = 1.0 + last_return = 0.0 + for step in reversed(range(self.buffer_size)): + if step == self.buffer_size - 1: + next_non_terminal = 1.0 - dones + next_value = last_value + last_return = self.rewards[step] + next_non_terminal * next_value + else: + next_non_terminal = 1.0 - self.dones[step + 1] + last_return = ( + self.rewards[step] + + self.gamma * last_return * next_non_terminal + ) + self.returns[step] = last_return + self.advantages = self.returns - self.values From 10282f06bb3485e8d380c67c21da6de467033b6e Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 11:49:27 +0530 Subject: [PATCH 27/39] Update genrl/utils/utils.py Co-authored-by: Vedant Shah --- genrl/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genrl/utils/utils.py b/genrl/utils/utils.py index ced06e31..7e6c849c 100644 --- a/genrl/utils/utils.py +++ b/genrl/utils/utils.py @@ -103,7 +103,7 @@ def mlp_concat( continue act = activation if layer < limit - 2 else nn.Identity() layers += [nn.Linear(layer_sizes[layer], layer_sizes[layer + 1])] - weight_init(layers[-1][0].weight) + weight_init(layers[-1].weight) layers += [act] return nn.Sequential(*layers) From 79b531b305b5890ad55d2b74fc23a0c048c19cf9 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 12:05:47 +0530 Subject: [PATCH 28/39] Update genrl/agents/deep/ppo1/ppo1.py Co-authored-by: Vedant Shah --- genrl/agents/deep/ppo1/ppo1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index db826403..daef962d 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -74,7 +74,6 @@ def _create_model(self): arch = self.network if self.shared_layers is not None: arch += "s" - self.ac = get_model("ac", arch)( state_dim, action_dim, From a3885a0b5f22f07a6ed718f07e95c9a68963e7c2 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 12:06:00 +0530 Subject: [PATCH 29/39] Update genrl/core/actor_critic.py Co-authored-by: Vedant Shah --- genrl/core/actor_critic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 4309a329..54706617 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -13,7 +13,6 @@ from genrl.utils.utils import cnn, mlp - class MlpActorCritic(BaseActorCritic): """MLP Actor Critic From e3dc677fdccd851d66a001401b7bb18ced894448 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 12:06:15 +0530 Subject: [PATCH 30/39] Update genrl/core/actor_critic.py Co-authored-by: Vedant 
Shah --- genrl/core/actor_critic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 54706617..8224baed 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -49,7 +49,6 @@ def get_params(self): return actor_params, critic_params - class MlpSharedActorCritic(BaseActorCritic): """MLP Shared Actor Critic From 4c2ad51ce150ea468153b18d899491940388818d Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 12:06:27 +0530 Subject: [PATCH 31/39] Update genrl/core/actor_critic.py Co-authored-by: Vedant Shah --- genrl/core/actor_critic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 8224baed..e4eb4f27 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -531,7 +531,6 @@ def get_action(self, state, deterministic=False): # state = torch.FloatTensor(state).to(self.device) logits = self.forward(None, state) - probs = nn.Softmax(dim=-1)(logits) dist = Categorical(probs) if deterministic: From 43554e453fcc8ba013b86b61baf62dfae6e9b3e7 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor <39992294+AdityaKapoor74@users.noreply.github.com> Date: Sun, 4 Oct 2020 12:06:43 +0530 Subject: [PATCH 32/39] Update genrl/core/actor_critic.py Co-authored-by: Vedant Shah --- genrl/core/actor_critic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index e4eb4f27..0a190556 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -164,6 +164,7 @@ def get_value(self, state: torch.Tensor): value = self.critic(shared_features) return value + class MlpSingleActorTwoCritic(BaseActorCritic): """MLP Actor Critic From 6828e9388af0109f33bcd0b940dd476a4ea0c39c Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:12:33 +0530 Subject: [PATCH 33/39] removing SharedAC class --- genrl/core/actor_critic.py | 81 +------------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 4309a329..fa3cc30e 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -9,11 +9,9 @@ from genrl.core.base import BaseActorCritic from genrl.core.policies import MlpPolicy from genrl.core.values import MlpValue - from genrl.utils.utils import cnn, mlp - class MlpActorCritic(BaseActorCritic): """MLP Actor Critic @@ -50,7 +48,6 @@ def get_params(self): return actor_params, critic_params - class MlpSharedActorCritic(BaseActorCritic): """MLP Shared Actor Critic @@ -166,6 +163,7 @@ def get_value(self, state: torch.Tensor): value = self.critic(shared_features) return value + class MlpSingleActorTwoCritic(BaseActorCritic): """MLP Actor Critic @@ -472,83 +470,6 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: return value -class SharedActorCritic(BaseActorCritic): - def __init__( - self, - state_dim, - action_dim, - shared_layers, - critic_post, - actor_post, - val_type="V", - weight_init="xavier_uniform", - activation_func="relu", - critic_prev=[], - actor_prev=[], - ): - super(SharedActorCritic, self).__init__() - if len(actor_prev) > 0 and len(critic_prev) > 0: - actor_prev = [state_dim] + list(actor_prev) - if val_type == "Qsa": - critic_prev = [state_dim + action_dim] + list(critic_prev) - else: - critic_prev = [state_dim] + critic_prev - else: - shared_layers = [state_dim] + list(shared_layers) - - if val_type == "V" or 
val_type == "Qsa": - critic_post = list(critic_post) + [1] - elif val_type == "Qs": - critic_post = list(critic_post) + [action_dim] - else: - raise NotImplementedError - - actor_post = list(actor_post) + [action_dim] - self.critic, self.actor = shared_mlp( - critic_prev, - actor_prev, - shared_layers, - critic_post, - actor_post, - weight_init, - activation_func, - False, - ) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - def get_params(self): - actor_params = self.actor.parameters() - critic_params = self.critic.parameters() - return actor_params, critic_params - - def forward(self, state_critic, state_action): - - if state_critic is not None: - return self.critic(state_critic) - - if state_action is not None: - return self.actor(state_action) - - def get_action(self, state, deterministic=False): - # state = torch.FloatTensor(state).to(self.device) - logits = self.forward(None, state) - - - probs = nn.Softmax(dim=-1)(logits) - dist = Categorical(probs) - if deterministic: - index = torch.argmax(probs, dim=-1).unsqueeze(-1).float() - else: - index = dist.sample() - print(index.shape) - return index, dist - - def get_value(self, state): - # state = torch.FloatTensor(state).to(self.device) - value = self.forward(state, None) - return value - - class MultiAgentActor(MlpPolicy): def __init__( self, From 194065f13697a9a6b4f9180ed48e51d6ab3a79c1 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:24:34 +0530 Subject: [PATCH 34/39] rectify --- genrl/agents/deep/base/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/genrl/agents/deep/base/base.py b/genrl/agents/deep/base/base.py index 48334274..cf3e40d3 100644 --- a/genrl/agents/deep/base/base.py +++ b/genrl/agents/deep/base/base.py @@ -34,7 +34,6 @@ def __init__( create_model: bool = True, batch_size: int = 64, gamma: float = 0.99, - shared_layers=None, policy_layers: Tuple = (64, 64), value_layers: Tuple = (64, 64), @@ -53,8 +52,6 @@ def __init__( self.value_layers = value_layers self.lr_policy = lr_policy self.lr_value = lr_value - self.actor_prev = actor_prev - self.critic_prev = critic_prev self.seed = kwargs["seed"] if "seed" in kwargs else None self.render = kwargs["render"] if "render" in kwargs else False From c0198bcd549e52e21afa33e10032746dd4850d8e Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:25:59 +0530 Subject: [PATCH 35/39] rectify --- genrl/core/rollout_storage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index eb6b602e..96383343 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -288,7 +288,6 @@ def __init__( :type num_agents: int :param capacity: Max buffer size :type capacity: int - """ self.buffer_size = buffer_size self.num_agents = num_agents From fe4083562f882296731b8ecbe8f9c6513c3ce3d7 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:26:41 +0530 Subject: [PATCH 36/39] rectify --- genrl/core/rollout_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index 96383343..6612cfe1 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -332,7 +332,7 @@ def reset(self) -> None: self.buffer_size, self.env.n_envs, self.num_agents ) self.generator_ready = False - super(RolloutBuffer, self).reset() + super(MultiAgentRolloutBuffer, self).reset() def add( self, From a50204a99055a7f43ca17017b46b9f69fc3d774a Mon Sep 17 
00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:31:40 +0530 Subject: [PATCH 37/39] rectifying --- tests/test_deep/test_agents/test_ddpg.py | 26 ------------------------ tests/test_deep/test_agents/test_sac.py | 25 ----------------------- tests/test_deep/test_agents/test_td3.py | 26 ------------------------ 3 files changed, 77 deletions(-) diff --git a/tests/test_deep/test_agents/test_ddpg.py b/tests/test_deep/test_agents/test_ddpg.py index 2205b764..9211517f 100644 --- a/tests/test_deep/test_agents/test_ddpg.py +++ b/tests/test_deep/test_agents/test_ddpg.py @@ -58,29 +58,3 @@ def test_ddpg_shared(): ) trainer.train() shutil.rmtree("./logs") - - -def test_ddpg_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = DDPG( - "mlp", - env, - batch_size=5, - noise=NormalActionNoise, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=4, - max_ep_len=200, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_sac.py b/tests/test_deep/test_agents/test_sac.py index 18b17502..38c0d490 100644 --- a/tests/test_deep/test_agents/test_sac.py +++ b/tests/test_deep/test_agents/test_sac.py @@ -49,28 +49,3 @@ def test_sac_shared(): ) trainer.train() shutil.rmtree("./logs") - - -def test_sac_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = SAC( - "mlp", - env, - batch_size=5, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_td3.py b/tests/test_deep/test_agents/test_td3.py index da715e65..f4515fc6 100644 --- a/tests/test_deep/test_agents/test_td3.py +++ b/tests/test_deep/test_agents/test_td3.py @@ -59,29 +59,3 @@ def test_td3_shared(): ) trainer.train() shutil.rmtree("./logs") - - -def test_td3_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = TD3( - "mlp", - env, - batch_size=5, - noise=OrnsteinUhlenbeckActionNoise, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") From 4a3cd745b6d7a566d0ee94c92ff9240d136e2d6c Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:46:04 +0530 Subject: [PATCH 38/39] removing unecessary code --- genrl/core/actor_critic.py | 59 -------------------------------------- 1 file changed, 59 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index fa3cc30e..e892cda4 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -470,65 +470,6 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: return value -class MultiAgentActor(MlpPolicy): - def __init__( - self, - state_dim: spaces.Space, - action_dim: spaces.Space, - hidden: Tuple = (32, 32), - discrete: bool = True, - **kwargs, - ): - super(MultiAgentActor, self).__init__( - state_dim, action_dim, hidden, discrete ** kwargs - ) - - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - def forward(self, state): - state = self.model(state) - return state - - def get_action(self, state, deterministic=False): - # state = 
torch.FloatTensor(state).to(self.device) - logits = self.forward(state) - - dist = F.softmax(logits, dim=0) - probs = Categorical(dist) - if deterministic: - index = torch.argmax(probs) - else: - index = probs.sample().cpu().detach().item() - return index - - -class MultiAgentCritic(MlpValue): - def __init__( - self, - state_dim: spaces.Space, - action_dim: spaces.Space, - fc_layers: Tuple = (32, 32), - val_type: str = "V", - **kwargs, - ): - super(MultiAgentCritic, self).__init__( - state_dim, action_dim, fc_layers, val_type, **kwargs - ) - - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - def forward(self, state): - - state = self.model(state) - - return state - - def get_value(self, state): - # state = torch.FloatTensor(state).to(self.device) - value = self.forward(state) - return value - - actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, From 602a7b5434eb62f9388537136f7390edb8b0ea62 Mon Sep 17 00:00:00 2001 From: Aditya Kapoor Date: Sun, 4 Oct 2020 12:52:31 +0530 Subject: [PATCH 39/39] removing unecessary code --- genrl/core/actor_critic.py | 1 - genrl/core/rollout_storage.py | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index e892cda4..1a96e959 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -2,7 +2,6 @@ import torch # noqa import torch.nn as nn # noqa -import torch.nn.functional as F from gym import spaces from torch.distributions import Categorical, Normal diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index 6612cfe1..6fcad2c6 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -282,13 +282,7 @@ def __init__( gamma: float = 0.99, ): super(MultiAgentRolloutBuffer, self).__init__(buffer_size, env, device) - """ - Initialising the buffer - :param num_agents: number of agents in the environment - :type num_agents: int - :param capacity: Max buffer size - :type capacity: int - """ + self.buffer_size = buffer_size self.num_agents = num_agents self.env = env