Update seeding process on env.reset(); update np_random to be _np_random for the envs' PRNG; update Mujoco env version from v3 to v4; all in accordance with gym v0.29.0
RaghuSpaceRajan committed Sep 16, 2024
1 parent febad86 commit 3ce485f
Showing 6 changed files with 150 additions and 71 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -49,7 +49,7 @@ pip install -r requirements.txt
pip install -e .[extras_disc]
```

-Please follow the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case, you do not have MuJoCo, please ignore any mujoco-py related installation errors below:
+Please use the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case you do not have MuJoCo, please ignore any mujoco-related installation errors below:
```bash
conda create -n py36_toy_rl_cont_comp python=3.6
conda activate py36_toy_rl_cont_comp
91 changes: 79 additions & 12 deletions example.py
@@ -11,8 +11,8 @@
one for grid environments with image representations
one for wrapping Atari env qbert
one for wrapping Mujoco env HalfCheetah
-one for wrapping MiniGrid env
-one for wrapping ProcGen env
+one for wrapping MiniGrid env # Currently commented out due to some errors
+one for wrapping ProcGen env # Currently commented out due to some errors
two examples at the end showing how to create toy envs using gym.make()
Many further examples can be found in test_mdp_playground.py.
@@ -383,32 +383,98 @@ def atari_wrapper_example():
display_image(next_state)


-def mujoco_wrapper_example():
+def mujoco_wrapper_examples():

# For Mujoco envs, a few specific dimensions need to be changed by fiddling with
# attributes of the MujocoEnv class. This is achieved through a Mujoco
# wrapper that subclasses the Mujoco env and modifies relevant properties.
# Please see the documentation of mujoco_env_wrapper.py for more details.
# Below, we specify 2 dicts: one for the specific dimensions that are changed
# using the Mujoco wrapper and the other for the general dimensions that are
# changed using a GymEnvWrapper.

# 1: Mujoco wrapper config:
# The scalar values for the dimensions passed in this dict are used to
# multiply the base environments' values. For these Mujoco envs, the
# time_unit is achieved by multiplying the Gym Mujoco env's frame_skip and
-# thus will be the integer part of time_unit * frame_skip. The time_unit
+# thus will be the integer part of time_unit * frame_skip. (For HalfCheetah-v4
+# and Pusher-v4, frame_skip is 5; for Reacher-v4, it is 2.) The time_unit
# is NOT achieved by changing Mujoco's timestep because that would change
# the numerical integration done by Mujoco and thus the environment
# dynamics.
-config = {
-"seed": 0,
+mujoco_wrap_config = {
"action_space_max": 0.5,
"time_unit": 0.5,
}
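# For example, with HalfCheetah-v4's frame_skip of 5 (see the comment above),
# time_unit = 0.5 gives an effective frame_skip of int(0.5 * 5) = 2 MuJoCo
# frames per env step. action_space_max = 0.5 halves the action range, e.g.
# HalfCheetah's Box(-1, 1, (6,)) becomes Box(-0.5, 0.5, (6,)).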

-# This actually makes a subclass and not a wrapper. Because, some
+# 2: Gym wrapper config:
+gym_wrap_config = {
+"seed": 0,
+"state_space_type": "continuous",
+"transition_noise": 0.25,
+}


+# This makes a subclass and not a wrapper because some
# frameworks might need an instance of this class to also be an instance
# of the Mujoco base_class.
try:
from mdp_playground.envs import get_mujoco_wrapper
-from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv

+# HalfCheetah example
+from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
wrapped_mujoco_env = get_mujoco_wrapper(HalfCheetahEnv)

-env = wrapped_mujoco_env(**config)
-state = env.reset()[0]
+env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

# From Gymnasium v26, the seed is set in the reset method.
state = env.reset(seed=gym_wrap_config["seed"])[0]
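# Gymnasium's reset() returns an (obs, info) tuple; [0] extracts the observation.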

print(
"Taking a step in the environment with a random action and printing the transition:"
)
action = env.action_space.sample()
next_state, reward, done, trunc, info = env.step(action)
print("sars', done =", state, action, reward, next_state, done)

env.close()

# Pusher example
from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
wrapped_mujoco_env = get_mujoco_wrapper(PusherEnv)

env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

state = env.reset(seed=gym_wrap_config["seed"])[0]

print(
"Taking a step in the environment with a random action and printing the transition:"
)
action = env.action_space.sample()
next_state, reward, done, trunc, info = env.step(action)
print("sars', done =", state, action, reward, next_state, done)

env.close()

# Reacher example
from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
wrapped_mujoco_env = get_mujoco_wrapper(ReacherEnv)

env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

state = env.reset(seed=gym_wrap_config["seed"])[0]

print(
"Taking a step in the environment with a random action and printing the transition:"
@@ -424,7 +490,7 @@ def mujoco_wrapper_example():
"Exception:",
type(e),
e,
"caught. You may need to install mujoco-py. NOT running mujoco_wrapper_example.",
"caught. You may need to install mujoco with pip. NOT running mujoco_wrapper_examples.",
)
return

@@ -567,7 +633,7 @@ def procgen_wrapper_example():
atari_wrapper_example()

print(set_ansi_escape + "\nRunning Mujoco wrapper example:\n" + reset_ansi_escape)
-mujoco_wrapper_example()
+mujoco_wrapper_examples()

print(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape)
# minigrid_wrapper_example()
@@ -579,6 +645,7 @@ def procgen_wrapper_example():
import mdp_playground
import gymnasium as gym

# The following are with seed=None:
gym.make("RLToy-v0")

env = gym.make(
2 changes: 1 addition & 1 deletion mdp_playground/envs/__init__.py
@@ -5,4 +5,4 @@
from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper
from mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper
except error.DependencyNotInstalled as e:
print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco-py.")
print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco with pip.")
51 changes: 28 additions & 23 deletions mdp_playground/envs/gym_env_wrapper.py
@@ -55,11 +55,12 @@ def __init__(self, env, **config):
# during the run of an env, the expectation is that all obs., act. space,
# etc. seeds are set during that call? Only Atari in Gym seems to do something
# similar, the others I saw there don't seem to set seed for obs., act. spaces.
-self.env.seed(
-seed_int
-) # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
-obs_space_seed = self.np_random.integers(sys.maxsize).item() # random
-act_space_seed = self.np_random.integers(sys.maxsize).item() # random
+if "seed" in dir(self.env): # hack
+self.env.seed(
+seed_int
+) # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
+obs_space_seed = self._np_random.integers(sys.maxsize).item() # random
+act_space_seed = self._np_random.integers(sys.maxsize).item() # random
self.env.observation_space.seed(obs_space_seed) # seed
self.env.action_space.seed(act_space_seed) # seed
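# Seeding the spaces separately makes e.g. action_space.sample() reproducible too.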

Expand Down Expand Up @@ -207,7 +208,7 @@ def __init__(self, env, **config):
# self.irrelevant_features = config["irrelevant_features"]
irr_toy_env_conf = config["irrelevant_features"]
if "seed" not in irr_toy_env_conf:
irr_toy_env_conf["seed"] = self.np_random.integers(sys.maxsize).item() # random
irr_toy_env_conf["seed"] = self._np_random.integers(sys.maxsize).item() # random

if config["state_space_type"] == "discrete":
pass
@@ -340,15 +341,15 @@ def step(self, action):
probs[action] = 1 - self.transition_noise
old_action = action
action = int(
-self.np_random.choice(self.env.action_space.n, size=1, p=probs)
+self._np_random.choice(self.env.action_space.n, size=1, p=probs)
) # random
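# The original action survives with probability 1 - transition_noise; the remaining probability mass in probs goes to the other actions.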
if old_action != action:
# print("NOISE inserted", old_action, action)
self.total_noisy_transitions_episode += 1
else: # cont. envs
if self.transition_noise is not None:
noise_in_transition = (
-self.transition_noise(self.np_random)
+self.transition_noise(self._np_random)
if self.transition_noise
else 0
) # #random
@@ -400,7 +401,7 @@ def step(self, action):
# action and time_step as well. Would need to change implementation to
# have a queue for the rewards achieved and then pick the reward that was
# generated delay timesteps ago.
-noise_in_reward = self.reward_noise(self.np_random) if self.reward_noise else 0
+noise_in_reward = self.reward_noise(self._np_random) if self.reward_noise else 0
self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
self.total_reward_episode += reward
reward += noise_in_reward
@@ -409,7 +410,11 @@

return next_state, reward, done, trunc, info

-def reset(self):
+def reset(self, seed=None):
+'''
+From Gymnasium v26, the reset method has a seed parameter.
+'''
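# The seed is passed through to the wrapped envs' reset() calls below; if it is not None, it reseeds their PRNGs (Gymnasium v26+ behaviour).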

# on episode "end" stuff (to not be invoked when reset() called when
# self.total_episodes = 0; end is in quotes because it may not be a true
# episode end reached by reaching a terminal state, but reset() may have
@@ -445,18 +450,18 @@ def reset(self):

if "irrelevant_features" in self.config:
if self.config["state_space_type"] == "discrete":
-reset_state = self.env.reset()[0]
-reset_state_irr = self.irr_toy_env.reset()[0]
-reset_state = tuple([reset_state, reset_state_irr])
+reset_state, reset_state_info = self.env.reset(seed=seed)
+reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+reset_state = tuple([reset_state, reset_state_irr]), tuple([reset_state_info, reset_state_irr_info])
else:
-reset_state = self.env.reset()[0]
-reset_state_irr = self.irr_toy_env.reset()[0]
-reset_state = np.concatenate((reset_state, reset_state_irr))
+reset_state, reset_state_info = self.env.reset(seed=seed)
+reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+reset_state = np.concatenate((reset_state, reset_state_irr)), tuple([reset_state_info, reset_state_irr_info])
else:
-reset_state = self.env.reset()[0]
+reset_state = self.env.reset(seed=seed)

if self.image_transforms:
-reset_state = self.get_transformed_image(reset_state)
+reset_state = (self.get_transformed_image(reset_state[0]), reset_state[1])

return reset_state
# return super(GymEnvWrapper, self).reset()
Expand All @@ -467,15 +472,15 @@ def seed(self, seed=None):
Parameters
----------
seed : int
-seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.
+seed to initialise the _np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.
Returns
-------
int
The seed returned by Gym
"""
# If seed is None, you get a randomly generated seed from gymnasium.utils...
-self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) # random
+self._np_random, self.seed_ = gym.utils.seeding.np_random(seed) # random
print(
"Env SEED set to: "
+ str(seed)
@@ -540,16 +545,16 @@ def get_transformed_image(self, env_img):
# + str(min_R)
# )
# min_R = np.log(min_R)
-# log_sample = min_R + self.np_random.random() * (max_R - min_R)
+# log_sample = min_R + self._np_random.random() * (max_R - min_R)
# sample_ = np.exp(log_sample)
# R = int(sample_)
# # print("R", min_R, max_R)
#
if "shift" in self.image_transforms:
max_shift_w = (tot_width - R) // 2
max_shift_h = (tot_height - R) // 2
-add_shift_w = self.np_random.integers(-max_shift_w + 1, max_shift_w).item()
-add_shift_h = self.np_random.integers(-max_shift_h + 1, max_shift_h).item()
+add_shift_w = self._np_random.integers(-max_shift_w + 1, max_shift_w).item()
+add_shift_h = self._np_random.integers(-max_shift_h + 1, max_shift_h).item()
# print("add_shift_w, add_shift_h", add_shift_w, add_shift_h)
add_shift_w = int(add_shift_w / sh_quant) * sh_quant
add_shift_h = int(add_shift_h / sh_quant) * sh_quant
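For reference, the Gymnasium v26+ seeding pattern adopted by the changes above can be sketched standalone (a hypothetical snippet, not part of this commit; CartPole-v1 and the seed values are arbitrary):

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
# Since Gymnasium v26, env.seed(s) is gone; the seed is passed to reset() instead.
obs, info = env.reset(seed=0)
# Observation and action spaces hold their own PRNGs and are seeded separately,
# as GymEnvWrapper does above with sub-seeds drawn from _np_random.
env.observation_space.seed(1)
env.action_space.seed(2)
```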
16 changes: 11 additions & 5 deletions mdp_playground/envs/mujoco_env_wrapper.py
@@ -1,20 +1,20 @@
# from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
-from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv
-from gymnasium.envs.mujoco.pusher import PusherEnv
-from gymnasium.envs.mujoco.reacher import ReacherEnv
+from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
+from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
+from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
import copy


def get_mujoco_wrapper(base_class):
"""Wraps a mujoco-py environment to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.
"""Wraps a mujoco environment, by subclassing it, to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.
Currently supported dimensions:
time_unit
action_space_max
For both of these dimensions, the scalar value passed in the dict is used to multiply the base environments' values.
-For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environments's frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying mujoco-py class to calculate rewards in th e environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.
+For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environments' frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying MujocoEnv class to calculate rewards in the environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.
Similarly for the action_space_max (which controls the action range), the new action range is achieved by multiplying the Gym Mujoco environments' action_max and action_min by the action_space_max passed in the dict.
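To illustrate the subclass-factory pattern this docstring describes, here is a minimal, hypothetical sketch (the names get_wrapper_sketch and TimeUnitEnv are illustrative only; the real get_mujoco_wrapper also rescales the reward weights and handles further config and error cases):

```python
from gymnasium.spaces import Box

def get_wrapper_sketch(base_class):
    # Subclass rather than wrap, so instances still pass
    # isinstance(env, base_class) checks that some frameworks perform.
    class TimeUnitEnv(base_class):
        def __init__(self, **config):
            time_unit = config.pop("time_unit", 1.0)
            action_space_max = config.pop("action_space_max", None)
            super().__init__(**config)
            # time_unit multiplies frame_skip; only the integer part is kept,
            # so time_unit * frame_skip should be (close to) an integer.
            self.frame_skip = int(self.frame_skip * time_unit)
            if action_space_max is not None:
                # Scale the action range by action_space_max.
                self.action_space = Box(
                    low=self.action_space.low * action_space_max,
                    high=self.action_space.high * action_space_max,
                    dtype=self.action_space.dtype,
                )
    return TimeUnitEnv
```

Usage would mirror example.py above: `WrappedEnv = get_wrapper_sketch(HalfCheetahEnv); env = WrappedEnv(time_unit=0.5, action_space_max=0.5)`.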
@@ -102,6 +102,12 @@ def __init__(self, **config): # Gets passed env_config from run_experiments.py
self._forward_reward_weight,
"corresponding to time_unit in config.",
)
else:
print("Current mujoco env is not HalfCheetah v4, so only frame_skip was modified when changing the time_unit. "\
"Not changing the _ctrl_cost_weight or _forward_reward_weight. It may make sense to also modify "\
"these variables depending on how they relate to the time_unit. You will need to look deeper into "\
"how the reward function is defined to know if this is needed.")


def step(self, action): # hack
obs, reward, done, trunc, info = super(MujocoEnvWrapper, self).step(action)
Expand Down