Fix for the infinite loop in Off Policy Agent tests #384

Open · wants to merge 12 commits into base: master
genrl/agents/__init__.py · 3 changes: 1 addition & 2 deletions
@@ -15,6 +15,7 @@
     NeuralNoiseSamplingAgent,
 )
 from genrl.agents.bandits.contextual.variational import VariationalAgent  # noqa
+from genrl.agents.bandits.multiarmed.base import MABAgent  # noqa
 from genrl.agents.bandits.multiarmed.bayesian import BayesianUCBMABAgent  # noqa
 from genrl.agents.bandits.multiarmed.bernoulli_mab import BernoulliMAB  # noqa
 from genrl.agents.bandits.multiarmed.epsgreedy import EpsGreedyMABAgent  # noqa
@@ -41,5 +42,3 @@
 from genrl.agents.deep.sac.sac import SAC  # noqa
 from genrl.agents.deep.td3.td3 import TD3  # noqa
 from genrl.agents.deep.vpg.vpg import VPG  # noqa
-
-from genrl.agents.bandits.multiarmed.base import MABAgent  # noqa; noqa; noqa
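The net effect of this file's change is only to move the MABAgent re-export into its alphabetical position instead of a stray trailing import. As a hedged sanity check (assuming a genrl install that includes this branch; the snippet below is illustrative and not part of the diff), the package-level import still resolves:

# Illustrative check, not part of the PR: MABAgent remains re-exported
# from the package root after the import re-ordering.
from genrl.agents import MABAgent  # noqa: F401

assert MABAgent.__module__ == "genrl.agents.bandits.multiarmed.base"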
genrl/trainers/offpolicy.py · 25 changes: 14 additions & 11 deletions
@@ -142,11 +142,12 @@ def train(self) -> None:
 
         self.training_rewards = []
         self.episodes = 0
+        self.timesteps = 0
 
-        for timestep in range(0, self.max_timesteps, self.env.n_envs):
-            self.agent.update_params_before_select_action(timestep)
+        while self.timesteps <= self.max_timesteps and self.episodes <= self.epochs:
+            self.agent.update_params_before_select_action(self.timesteps)
 
-            action = self.get_action(state, timestep)
+            action = self.get_action(state, self.timesteps)
             next_state, reward, done, info = self.env.step(action)
 
             if self.render:
@@ -164,20 +165,22 @@ def train(self) -> None:
                 self.noise_reset()
 
             if self.episodes % self.log_interval == 0:
-                self.log(timestep)
+                self.log(self.timesteps)
 
-            if self.episodes == self.epochs:
-                break
-
-            if timestep >= self.start_update and timestep % self.update_interval == 0:
+            if (
+                self.timesteps >= self.start_update
+                and self.timesteps % self.update_interval == 0
+            ):
                 self.agent.update_params(self.update_interval)
 
             if (
-                timestep >= self.start_update
+                self.timesteps >= self.start_update
                 and self.save_interval != 0
-                and timestep % self.save_interval == 0
+                and self.timesteps % self.save_interval == 0
             ):
-                self.save(timestep)
+                self.save(self.timesteps)
+
+            self.timesteps += self.env.n_envs
 
         self.env.close()
         self.logger.close()
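For readers skimming the diff, here is a minimal, self-contained sketch of the loop shape this hunk introduces. It is not genrl's actual OffPolicyTrainer API; the names (max_timesteps, epochs, n_envs, timesteps, episodes) simply mirror the attributes used above, and the episode bookkeeping is reduced to a placeholder. The point is that the while condition bounds training by both the timestep budget and the episode budget, so train() returns as soon as either is exhausted, rather than relying on the exact-equality `episodes == epochs` break the old loop used.

# Minimal sketch (illustrative only, not genrl's trainer) of the loop
# structure after this change. Attribute names mirror the diff above.
class LoopSketch:
    def __init__(self, max_timesteps: int, epochs: int, n_envs: int):
        self.max_timesteps = max_timesteps
        self.epochs = epochs
        self.n_envs = n_envs
        self.timesteps = 0
        self.episodes = 0

    def train(self) -> None:
        # Bounded by both budgets: stops when either is exhausted.
        while self.timesteps <= self.max_timesteps and self.episodes <= self.epochs:
            # ... select action, step the vectorized env, log, update ...
            self.episodes += 2               # placeholder: several envs may finish at once
            self.timesteps += self.n_envs    # mirrors `self.timesteps += self.env.n_envs`


# Even though `episodes` never equals `epochs` exactly here, train() still terminates.
LoopSketch(max_timesteps=10_000, epochs=5, n_envs=2).train()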
tests/test_agents/test_bandit/__init__.py · 4 changes: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from tests.test_agents.test_bandit.test_cb_agents import TestCBAgent  # noqa
 from tests.test_agents.test_bandit.test_data_bandits import TestDataBandit  # noqa
 from tests.test_agents.test_bandit.test_mab_agents import TestMABAgent  # noqa
-from tests.test_agents.test_bandit.test_multi_armed_bandits import (
-    TestMultiArmedBandit,  # noqa
+from tests.test_agents.test_bandit.test_multi_armed_bandits import (  # noqa
+    TestMultiArmedBandit,
 )