Update seeding process on env.reset(); update np_random to be _np_random for the envs' PRNG; update Mujoco env version from v3 to v4; all in accordance with gym v0.29.0
RaghuSpaceRajan committed Sep 16, 2024
1 parent febad86 commit 3ce485f
Showing 6 changed files with 150 additions and 71 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -49,7 +49,7 @@ pip install -r requirements.txt
pip install -e .[extras_disc]
```

-Please follow the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case, you do not have MuJoCo, please ignore any mujoco-py related installation errors below:
+Please use the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case you do not have MuJoCo, please ignore any mujoco-related installation errors below:
```bash
conda create -n py36_toy_rl_cont_comp python=3.6
conda activate py36_toy_rl_cont_comp
91 changes: 79 additions & 12 deletions example.py
@@ -11,8 +11,8 @@
one for grid environments with image representations
one for wrapping Atari env qbert
one for wrapping Mujoco env HalfCheetah
-one for wrapping MiniGrid env
-one for wrapping ProcGen env
+one for wrapping MiniGrid env # Currently commented out due to some errors
+one for wrapping ProcGen env # Currently commented out due to some errors
two examples at the end showing how to create toy envs using gym.make()
Many further examples can be found in test_mdp_playground.py.
@@ -383,32 +383,98 @@ def atari_wrapper_example():
display_image(next_state)


-def mujoco_wrapper_example():
+def mujoco_wrapper_examples():

# For Mujoco envs, a few specific dimensions need to be changed by fiddling with
# attributes of the MujocoEnv class. This is achieved through a Mujoco
# wrapper that subclasses the Mujoco env and modifies relevant properties.
# Please see the documentation of mujoco_env_wrapper.py for more details.
# Below, we specify 2 dicts: one for the specific dimensions that are changed
# using the Mujoco wrapper and the other for the general dimensions that are
# changed using a GymEnvWrapper.

# 1: Mujoco wrapper config:
# The scalar values for the dimensions passed in this dict are used to
# multiply the base environments' values. For these Mujoco envs, the
# time_unit is achieved by multiplying the Gym Mujoco env's frame_skip and
-# thus will be the integer part of time_unit * frame_skip. The time_unit
+# thus will be the integer part of time_unit * frame_skip. (For HalfCheetah-v4
+# and Pusher-v4, frame_skip is 5; for Reacher-v4, it is 2.) The time_unit
# is NOT achieved by changing Mujoco's timestep because that would change
# the numerical integration done by Mujoco and thus the environment
# dynamics.
-config = {
-"seed": 0,
+mujoco_wrap_config = {
"action_space_max": 0.5,
"time_unit": 0.5,
}
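# For example, with HalfCheetah-v4's frame_skip of 5 (see the comment above),
# time_unit = 0.5 gives an effective frame_skip of int(0.5 * 5) = 2 MuJoCo
# frames per env step. action_space_max = 0.5 halves the action range, e.g.
# HalfCheetah's Box(-1, 1, (6,)) becomes Box(-0.5, 0.5, (6,)).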

-# This actually makes a subclass and not a wrapper. Because, some
+# 2: Gym wrapper config:
+gym_wrap_config = {
+"seed": 0,
+"state_space_type": "continuous",
+"transition_noise": 0.25,
+}


+# This makes a subclass and not a wrapper because some
# frameworks might need an instance of this class to also be an instance
# of the Mujoco base_class.
try:
from mdp_playground.envs import get_mujoco_wrapper
-from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv

+# HalfCheetah example
+from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
wrapped_mujoco_env = get_mujoco_wrapper(HalfCheetahEnv)

-env = wrapped_mujoco_env(**config)
-state = env.reset()[0]
+env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

# From Gymnasium v26, the seed is set in the reset method.
state = env.reset(seed=gym_wrap_config["seed"])[0]
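# Gymnasium's reset() returns an (obs, info) tuple; [0] extracts the observation.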

print(
"Taking a step in the environment with a random action and printing the transition:"
)
action = env.action_space.sample()
next_state, reward, done, trunc, info = env.step(action)
print("sars', done =", state, action, reward, next_state, done)

env.close()

# Pusher example
from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
wrapped_mujoco_env = get_mujoco_wrapper(PusherEnv)

env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

state = env.reset(seed=gym_wrap_config["seed"])[0]

print(
"Taking a step in the environment with a random action and printing the transition:"
)
action = env.action_space.sample()
next_state, reward, done, trunc, info = env.step(action)
print("sars', done =", state, action, reward, next_state, done)

env.close()

# Reacher example
from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
wrapped_mujoco_env = get_mujoco_wrapper(ReacherEnv)

env = wrapped_mujoco_env(**mujoco_wrap_config)

from mdp_playground.envs import GymEnvWrapper
import gymnasium as gym
env = GymEnvWrapper(env, **gym_wrap_config)

state = env.reset(seed=gym_wrap_config["seed"])[0]

print(
"Taking a step in the environment with a random action and printing the transition:"
@@ -424,7 +490,7 @@ def mujoco_wrapper_example():
"Exception:",
type(e),
e,
"caught. You may need to install mujoco-py. NOT running mujoco_wrapper_example.",
"caught. You may need to install mujoco with pip. NOT running mujoco_wrapper_examples.",
)
return

@@ -567,7 +633,7 @@ def procgen_wrapper_example():
atari_wrapper_example()

print(set_ansi_escape + "\nRunning Mujoco wrapper example:\n" + reset_ansi_escape)
-mujoco_wrapper_example()
+mujoco_wrapper_examples()

print(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape)
# minigrid_wrapper_example()
@@ -579,6 +645,7 @@ def procgen_wrapper_example():
import mdp_playground
import gymnasium as gym

# The following are with seed=None:
gym.make("RLToy-v0")

env = gym.make(
2 changes: 1 addition & 1 deletion mdp_playground/envs/__init__.py
@@ -5,4 +5,4 @@
from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper
from mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper
except error.DependencyNotInstalled as e:
print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco-py.")
print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco with pip.")
51 changes: 28 additions & 23 deletions mdp_playground/envs/gym_env_wrapper.py
@@ -55,11 +55,12 @@ def __init__(self, env, **config):
# during the run of an env, the expectation is that all obs., act. space,
# etc. seeds are set during that call? Only Atari in Gym seems to do something
# similar, the others I saw there don't seem to set seed for obs., act. spaces.
-self.env.seed(
-seed_int
-) # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
-obs_space_seed = self.np_random.integers(sys.maxsize).item() # random
-act_space_seed = self.np_random.integers(sys.maxsize).item() # random
+if "seed" in dir(self.env): # hack
+self.env.seed(
+seed_int
+) # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
+obs_space_seed = self._np_random.integers(sys.maxsize).item() # random
+act_space_seed = self._np_random.integers(sys.maxsize).item() # random
self.env.observation_space.seed(obs_space_seed) # seed
self.env.action_space.seed(act_space_seed) # seed
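# Seeding the spaces separately makes e.g. action_space.sample() reproducible too.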

Expand Down Expand Up @@ -207,7 +208,7 @@ def __init__(self, env, **config):
# self.irrelevant_features = config["irrelevant_features"]
irr_toy_env_conf = config["irrelevant_features"]
if "seed" not in irr_toy_env_conf:
irr_toy_env_conf["seed"] = self.np_random.integers(sys.maxsize).item() # random
irr_toy_env_conf["seed"] = self._np_random.integers(sys.maxsize).item() # random

if config["state_space_type"] == "discrete":
pass
@@ -340,15 +341,15 @@ def step(self, action):
probs[action] = 1 - self.transition_noise
old_action = action
action = int(
-self.np_random.choice(self.env.action_space.n, size=1, p=probs)
+self._np_random.choice(self.env.action_space.n, size=1, p=probs)
) # random
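# The original action survives with probability 1 - transition_noise; the remaining probability mass in probs goes to the other actions.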
if old_action != action:
# print("NOISE inserted", old_action, action)
self.total_noisy_transitions_episode += 1
else: # cont. envs
if self.transition_noise is not None:
noise_in_transition = (
-self.transition_noise(self.np_random)
+self.transition_noise(self._np_random)
if self.transition_noise
else 0
) # #random
@@ -400,7 +401,7 @@ def step(self, action):
# action and time_step as well. Would need to change implementation to
# have a queue for the rewards achieved and then pick the reward that was
# generated delay timesteps ago.
-noise_in_reward = self.reward_noise(self.np_random) if self.reward_noise else 0
+noise_in_reward = self.reward_noise(self._np_random) if self.reward_noise else 0
self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
self.total_reward_episode += reward
reward += noise_in_reward
@@ -409,7 +410,11 @@

return next_state, reward, done, trunc, info

-def reset(self):
+def reset(self, seed=None):
+'''
+From Gymnasium v26, the reset method has a seed parameter.
+'''
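# The seed is passed through to the wrapped envs' reset() calls below; if it is not None, it reseeds their PRNGs (Gymnasium v26+ behaviour).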

# on episode "end" stuff (to not be invoked when reset() called when
# self.total_episodes = 0; end is in quotes because it may not be a true
# episode end reached by reaching a terminal state, but reset() may have
@@ -445,18 +450,18 @@ def reset(self):

if "irrelevant_features" in self.config:
if self.config["state_space_type"] == "discrete":
-reset_state = self.env.reset()[0]
-reset_state_irr = self.irr_toy_env.reset()[0]
-reset_state = tuple([reset_state, reset_state_irr])
+reset_state, reset_state_info = self.env.reset(seed=seed)
+reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+reset_state = tuple([reset_state, reset_state_irr]), tuple([reset_state_info, reset_state_irr_info])
else:
-reset_state = self.env.reset()[0]
-reset_state_irr = self.irr_toy_env.reset()[0]
-reset_state = np.concatenate((reset_state, reset_state_irr))
+reset_state, reset_state_info = self.env.reset(seed=seed)
+reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+reset_state = np.concatenate((reset_state, reset_state_irr)), tuple([reset_state_info, reset_state_irr_info])
else:
-reset_state = self.env.reset()[0]
+reset_state = self.env.reset(seed=seed)

if self.image_transforms:
-reset_state = self.get_transformed_image(reset_state)
+reset_state = (self.get_transformed_image(reset_state[0]), reset_state[1])

return reset_state
# return super(GymEnvWrapper, self).reset()
Expand All @@ -467,15 +472,15 @@ def seed(self, seed=None):
Parameters
----------
seed : int
-seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.
+seed to initialise the _np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.
Returns
-------
int
The seed returned by Gym
"""
# If seed is None, you get a randomly generated seed from gymnasium.utils...
-self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) # random
+self._np_random, self.seed_ = gym.utils.seeding.np_random(seed) # random
print(
"Env SEED set to: "
+ str(seed)
@@ -540,16 +545,16 @@ def get_transformed_image(self, env_img):
# + str(min_R)
# )
# min_R = np.log(min_R)
-# log_sample = min_R + self.np_random.random() * (max_R - min_R)
+# log_sample = min_R + self._np_random.random() * (max_R - min_R)
# sample_ = np.exp(log_sample)
# R = int(sample_)
# # print("R", min_R, max_R)
#
if "shift" in self.image_transforms:
max_shift_w = (tot_width - R) // 2
max_shift_h = (tot_height - R) // 2
-add_shift_w = self.np_random.integers(-max_shift_w + 1, max_shift_w).item()
-add_shift_h = self.np_random.integers(-max_shift_h + 1, max_shift_h).item()
+add_shift_w = self._np_random.integers(-max_shift_w + 1, max_shift_w).item()
+add_shift_h = self._np_random.integers(-max_shift_h + 1, max_shift_h).item()
# print("add_shift_w, add_shift_h", add_shift_w, add_shift_h)
add_shift_w = int(add_shift_w / sh_quant) * sh_quant
add_shift_h = int(add_shift_h / sh_quant) * sh_quant
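For reference, the Gymnasium v26+ seeding pattern adopted by the changes above can be sketched standalone (a hypothetical snippet, not part of this commit; CartPole-v1 and the seed values are arbitrary):

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
# Since Gymnasium v26, env.seed(s) is gone; the seed is passed to reset() instead.
obs, info = env.reset(seed=0)
# Observation and action spaces hold their own PRNGs and are seeded separately,
# as GymEnvWrapper does above with sub-seeds drawn from _np_random.
env.observation_space.seed(1)
env.action_space.seed(2)
```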
16 changes: 11 additions & 5 deletions mdp_playground/envs/mujoco_env_wrapper.py
@@ -1,20 +1,20 @@
# from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
-from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv
-from gymnasium.envs.mujoco.pusher import PusherEnv
-from gymnasium.envs.mujoco.reacher import ReacherEnv
+from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
+from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
+from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
import copy


def get_mujoco_wrapper(base_class):
"""Wraps a mujoco-py environment to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.
"""Wraps a mujoco environment, by subclassing it, to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.
Currently supported dimensions:
time_unit
action_space_max
For both of these dimensions, the scalar value passed in the dict is used to multiply the base environments' values.
-For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environments's frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying mujoco-py class to calculate rewards in th e environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.
+For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environments' frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying MujocoEnv class to calculate rewards in the environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.
Similarly for the action_space_max (which controls the action range), the new action range is achieved by multiplying the Gym Mujoco environments' action_max and action_min by the action_space_max passed in the dict.
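To illustrate the subclass-factory pattern this docstring describes, here is a minimal, hypothetical sketch (the names get_wrapper_sketch and TimeUnitEnv are illustrative only; the real get_mujoco_wrapper also rescales the reward weights and handles further config and error cases):

```python
from gymnasium.spaces import Box

def get_wrapper_sketch(base_class):
    # Subclass rather than wrap, so instances still pass
    # isinstance(env, base_class) checks that some frameworks perform.
    class TimeUnitEnv(base_class):
        def __init__(self, **config):
            time_unit = config.pop("time_unit", 1.0)
            action_space_max = config.pop("action_space_max", None)
            super().__init__(**config)
            # time_unit multiplies frame_skip; only the integer part is kept,
            # so time_unit * frame_skip should be (close to) an integer.
            self.frame_skip = int(self.frame_skip * time_unit)
            if action_space_max is not None:
                # Scale the action range by action_space_max.
                self.action_space = Box(
                    low=self.action_space.low * action_space_max,
                    high=self.action_space.high * action_space_max,
                    dtype=self.action_space.dtype,
                )
    return TimeUnitEnv
```

Usage would mirror example.py above: `WrappedEnv = get_wrapper_sketch(HalfCheetahEnv); env = WrappedEnv(time_unit=0.5, action_space_max=0.5)`.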
@@ -102,6 +102,12 @@ def __init__(self, **config): # Gets passed env_config from run_experiments.py
self._forward_reward_weight,
"corresponding to time_unit in config.",
)
else:
print("Current mujoco env is not HalfCheetah v4, so only frame_skip was modified when changing the time_unit. "\
"Not changing the _ctrl_cost_weight or _forward_reward_weight. It may make sense to also modify "\
"these variables depending on how they relate to the time_unit. You will need to look deeper into "\
"how the reward function is defined to know if this is needed.")


def step(self, action): # hack
obs, reward, done, trunc, info = super(MujocoEnvWrapper, self).step(action)
Expand Down