From 08a6c52a50efd16fc361a3f4eb83cd319dc6b011 Mon Sep 17 00:00:00 2001
From: zjowowen <93968541+zjowowen@users.noreply.github.com>
Date: Thu, 21 Sep 2023 17:08:13 +0800
Subject: [PATCH] feature(zjow): polish ppof agent code for opendilab huggingface (#730)

* polish ppof code
---
 ding/bonus/config.py                          | 136 ++++++++---------
 ding/bonus/ppof.py                            | 140 ++++++++++++------
 .../framework/middleware/functional/logger.py |  50 +++++--
 3 files changed, 199 insertions(+), 127 deletions(-)

diff --git a/ding/bonus/config.py b/ding/bonus/config.py
index 676474c28a..c449c3fdc5 100644
--- a/ding/bonus/config.py
+++ b/ding/bonus/config.py
@@ -7,24 +7,24 @@
 from ding.policy import PPOFPolicy
 
 
-def get_instance_config(env: str, algorithm: str) -> EasyDict:
+def get_instance_config(env_id: str, algorithm: str) -> EasyDict:
     if algorithm == 'PPOF':
         cfg = PPOFPolicy.default_config()
-        if env == 'lunarlander_discrete':
+        if env_id == 'LunarLander-v2':
             cfg.n_sample = 512
             cfg.value_norm = 'popart'
             cfg.entropy_weight = 1e-3
-        elif env == 'lunarlander_continuous':
+        elif env_id == 'LunarLanderContinuous-v2':
             cfg.action_space = 'continuous'
             cfg.n_sample = 400
-        elif env == 'bipedalwalker':
+        elif env_id == 'BipedalWalker-v3':
             cfg.learning_rate = 1e-3
             cfg.action_space = 'continuous'
             cfg.n_sample = 1024
-        elif env == 'acrobot':
+        elif env_id == 'acrobot':
             cfg.learning_rate = 1e-4
             cfg.n_sample = 400
-        elif env == 'rocket_landing':
+        elif env_id == 'rocket_landing':
             cfg.n_sample = 2048
             cfg.adv_norm = False
             cfg.model = dict(
@@ -32,13 +32,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'drone_fly':
+        elif env_id == 'drone_fly':
             cfg.action_space = 'continuous'
             cfg.adv_norm = False
             cfg.epoch_per_collect = 5
             cfg.learning_rate = 5e-5
             cfg.n_sample = 640
-        elif env == 'hybrid_moving':
+        elif env_id == 'hybrid_moving':
             cfg.action_space = 'hybrid'
             cfg.n_sample = 3200
             cfg.entropy_weight = 0.03
@@ -50,13 +50,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 fixed_sigma_value=0.3,
                 bound_type='tanh',
             )
-        elif env == 'evogym_carrier':
+        elif env_id == 'evogym_carrier':
             cfg.action_space = 'continuous'
             cfg.n_sample = 2048
             cfg.batch_size = 256
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-3
-        elif env == 'mario':
+        elif env_id == 'mario':
             cfg.n_sample = 256
             cfg.batch_size = 64
             cfg.epoch_per_collect = 2
@@ -66,14 +66,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 actor_head_hidden_size=128,
             )
-        elif env == 'di_sheep':
+        elif env_id == 'di_sheep':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-4
             cfg.adv_norm = False
             cfg.entropy_weight = 0.001
-        elif env == 'procgen_bigfish':
+        elif env_id == 'procgen_bigfish':
             cfg.n_sample = 16384
             cfg.batch_size = 16384
             cfg.epoch_per_collect = 10
@@ -83,7 +83,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=256,
                 actor_head_hidden_size=256,
             )
-        elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling']:
+        elif env_id in ['KangarooNoFrameskip-v4', 'BowlingNoFrameskip-v4']:
             cfg.n_sample = 1024
             cfg.batch_size = 128
             cfg.epoch_per_collect = 10
@@ -94,7 +94,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 critic_head_layer_num=2,
             )
-        elif env == 'PongNoFrameskip':
+        elif env_id == 'PongNoFrameskip-v4':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
@@ -104,7 +104,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'SpaceInvadersNoFrameskip':
+        elif env_id == 'SpaceInvadersNoFrameskip-v4':
             cfg.n_sample = 320
             cfg.batch_size = 320
             cfg.epoch_per_collect = 1
@@ -116,7 +116,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'QbertNoFrameskip':
+        elif env_id == 'QbertNoFrameskip-v4':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
@@ -127,13 +127,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'minigrid_fourroom':
+        elif env_id == 'minigrid_fourroom':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.learning_rate = 3e-4
             cfg.epoch_per_collect = 10
             cfg.entropy_weight = 0.001
-        elif env == 'metadrive':
+        elif env_id == 'metadrive':
             cfg.learning_rate = 3e-4
             cfg.action_space = 'continuous'
             cfg.entropy_weight = 0.001
@@ -146,49 +146,61 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 critic_head_layer_num=2,
             )
-        elif env in ['hopper']:
+        elif env_id == 'Hopper-v3':
+            cfg.action_space = "continuous"
+            cfg.n_sample = 3200
+            cfg.batch_size = 320
+            cfg.epoch_per_collect = 10
+            cfg.learning_rate = 3e-4
+        elif env_id == 'HalfCheetah-v3':
+            cfg.action_space = "continuous"
+            cfg.n_sample = 3200
+            cfg.batch_size = 320
+            cfg.epoch_per_collect = 10
+            cfg.learning_rate = 3e-4
+        elif env_id == 'Walker2d-v3':
             cfg.action_space = "continuous"
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-4
         else:
-            raise KeyError("not supported env type: {}".format(env))
+            raise KeyError("not supported env type: {}".format(env_id))
     else:
         raise KeyError("not supported algorithm type: {}".format(algorithm))
     return cfg
 
 
-def get_instance_env(env: str) -> BaseEnv:
-    if env == 'lunarlander_discrete':
+def get_instance_env(env_id: str) -> BaseEnv:
+    if env_id == 'LunarLander-v2':
         return DingEnvWrapper(gym.make('LunarLander-v2'))
-    elif env == 'lunarlander_continuous':
-        return DingEnvWrapper(gym.make('LunarLander-v2', continuous=True))
+    elif env_id == 'LunarLanderContinuous-v2':
+        return DingEnvWrapper(gym.make('LunarLanderContinuous-v2', continuous=True))
-    elif env == 'bipedalwalker':
+    elif env_id == 'BipedalWalker-v3':
         return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True, 'rew_clip': True})
-    elif env == 'pendulum':
+    elif env_id == 'Pendulum-v1':
         return DingEnvWrapper(gym.make('Pendulum-v1'), cfg={'act_scale': True})
-    elif env == 'acrobot':
+    elif env_id == 'acrobot':
         return DingEnvWrapper(gym.make('Acrobot-v1'))
-    elif env == 'rocket_landing':
+    elif env_id == 'rocket_landing':
         from dizoo.rocket.envs import RocketEnv
         cfg = EasyDict({
             'task': 'landing',
             'max_steps': 800,
         })
         return RocketEnv(cfg)
-    elif env == 'drone_fly':
+    elif env_id == 'drone_fly':
         from dizoo.gym_pybullet_drones.envs import GymPybulletDronesEnv
         cfg = EasyDict({
             'env_id': 'flythrugate-aviary-v0',
             'action_type': 'VEL',
         })
         return GymPybulletDronesEnv(cfg)
-    elif env == 'hybrid_moving':
+    elif env_id == 'hybrid_moving':
         import gym_hybrid
         return DingEnvWrapper(gym.make('Moving-v0'))
-    elif env == 'evogym_carrier':
+    elif env_id == 'evogym_carrier':
         import evogym.envs
         from evogym import sample_robot, WorldObject
         path = os.path.join(os.path.dirname(__file__), '../../dizoo/evogym/envs/world_data/carry_bot.json')
@@ -203,7 +215,7 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'mario':
+    elif env_id == 'mario':
         import gym_super_mario_bros
         from nes_py.wrappers import JoypadSpace
         return DingEnvWrapper(
@@ -219,10 +231,10 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'di_sheep':
+    elif env_id == 'di_sheep':
         from sheep_env import SheepEnv
         return DingEnvWrapper(SheepEnv(level=9))
-    elif env == 'procgen_bigfish':
+    elif env_id == 'procgen_bigfish':
         return DingEnvWrapper(
             gym.make('procgen:procgen-bigfish-v0', start_level=0, num_levels=1),
             cfg={
@@ -234,7 +246,7 @@ def get_instance_env(env: str) -> BaseEnv:
             },
             seed_api=False,
         )
-    elif env == 'hopper':
+    elif env_id == 'Hopper-v3':
         cfg = EasyDict(
             env_id='Hopper-v3',
             env_wrapper='mujoco_default',
@@ -242,7 +254,7 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg)
-    elif env == 'HalfCheetah':
+    elif env_id == 'HalfCheetah-v3':
         cfg = EasyDict(
             env_id='HalfCheetah-v3',
             env_wrapper='mujoco_default',
@@ -250,7 +262,7 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('HalfCheetah-v3'), cfg=cfg)
-    elif env == 'Walker2d':
+    elif env_id == 'Walker2d-v3':
         cfg = EasyDict(
             env_id='Walker2d-v3',
             env_wrapper='mujoco_default',
@@ -258,42 +270,24 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg)
-    elif env == "SpaceInvadersNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "SpaceInvadersNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("SpaceInvadersNoFrameskip-v4"), cfg=cfg)
-    elif env == "PongNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "PongNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("PongNoFrameskip-v4"), cfg=cfg)
-    elif env == "QbertNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "QbertNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("QbertNoFrameskip-v4"), cfg=cfg)
-    elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout', 'atari_spaceinvader',
-                 'atari_gopher']:
-        from dizoo.atari.envs.atari_env import AtariEnv
-        atari_env_list = {
-            'atari_qbert': 'QbertNoFrameskip-v4',
-            'atari_kangaroo': 'KangarooNoFrameskip-v4',
-            'atari_bowling': 'BowlingNoFrameskip-v4',
-            'atari_breakout': 'BreakoutNoFrameskip-v4',
-            'atari_spaceinvader': 'SpaceInvadersNoFrameskip-v4',
-            'atari_gopher': 'GopherNoFrameskip-v4'
-        }
+
+    elif env_id in [
+        'BowlingNoFrameskip-v4',
+        'BreakoutNoFrameskip-v4',
+        'GopherNoFrameskip-v4',
+        'KangarooNoFrameskip-v4',
+        'PongNoFrameskip-v4',
+        'QbertNoFrameskip-v4',
+        'SpaceInvadersNoFrameskip-v4',
+    ]:
+
         cfg = EasyDict({
-            'env_id': atari_env_list[env],
+            'env_id': env_id,
             'env_wrapper': 'atari_default',
         })
-        ding_env_atari = DingEnvWrapper(gym.make(atari_env_list[env]), cfg=cfg)
+        ding_env_atari = DingEnvWrapper(gym.make(env_id), cfg=cfg)
         return ding_env_atari
-    elif env == 'minigrid_fourroom':
+    elif env_id == 'minigrid_fourroom':
         import gymnasium
         return DingEnvWrapper(
             gymnasium.make('MiniGrid-FourRooms-v0'),
@@ -306,7 +300,7 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'metadrive':
+    elif env_id == 'metadrive':
         from dizoo.metadrive.env.drive_env import MetaDrivePPOOriginEnv
         from dizoo.metadrive.env.drive_wrapper import DriveEnvWrapper
         cfg = dict(
@@ -319,7 +313,7 @@ def get_instance_env(env: str) -> BaseEnv:
         cfg = EasyDict(cfg)
         return DriveEnvWrapper(MetaDrivePPOOriginEnv(cfg))
     else:
-        raise KeyError("not supported env type: {}".format(env))
+        raise KeyError("not supported env type: {}".format(env_id))
 
 
 def get_hybrid_shape(action_space) -> EasyDict:
diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py
index 3948f726ce..149b42c3c1 100644
--- a/ding/bonus/ppof.py
+++ b/ding/bonus/ppof.py
@@ -5,6 +5,7 @@
 import os
 import gym
 import gymnasium
+import numpy as np
 import torch
 from ding.framework import task, OnlineRLContext
 from ding.framework.middleware import interaction_evaluator_ttorch, PPOFStepCollector, multistep_trainer, CkptSaver, \
@@ -12,6 +13,7 @@
 from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2
 from ding.policy import PPOFPolicy, single_env_forward_wrapper_ttorch
 from ding.utils import set_pkg_seed
+from ding.utils import get_env_fps, render
 from ding.config import save_config_py
 from .model import PPOFModel
 from .config import get_instance_config, get_instance_env, get_hybrid_shape
@@ -21,9 +23,9 @@ class PPOF:
 
     supported_env_list = [
         # common
-        'lunarlander_discrete',
-        'lunarlander_continuous',
-        'bipedalwalker',
+        'LunarLander-v2',
+        'LunarLanderContinuous-v2',
+        'BipedalWalker-v3',
         'acrobot',
         # ch2: action
         'rocket_landing',
@@ -38,42 +40,64 @@ class PPOF:
         'minigrid_fourroom',
         'metadrive',
         # atari
-        'atari_qbert',
-        'atari_kangaroo',
-        'atari_bowling',
-        'PongNoFrameskip',
-        'SpaceInvadersNoFrameskip',
-        'QbertNoFrameskip',
+        'BowlingNoFrameskip-v4',
+        'BreakoutNoFrameskip-v4',
+        'GopherNoFrameskip-v4',
+        'KangarooNoFrameskip-v4',
+        'PongNoFrameskip-v4',
+        'QbertNoFrameskip-v4',
+        'SpaceInvadersNoFrameskip-v4',
         # mujoco
-        'hopper',
+        'Hopper-v3',
+        'HalfCheetah-v3',
+        'Walker2d-v3',
     ]
 
     def __init__(
             self,
-            env: Union[str, BaseEnv],
+            env_id: str = None,
+            env: BaseEnv = None,
             seed: int = 0,
-            exp_name: str = 'default_experiment',
+            exp_name: str = None,
             model: Optional[torch.nn.Module] = None,
-            cfg: Optional[EasyDict] = None,
-            policy_state_dict: str = None,
+            cfg: Optional[Union[EasyDict, dict]] = None,
+            policy_state_dict: str = None
     ) -> None:
-        if isinstance(env, str):
-            assert env in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list)
-            self.env = get_instance_env(env)
+        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+
+        if cfg is not None and not isinstance(cfg, EasyDict):
+            cfg = EasyDict(cfg)
+
+        if env_id is not None:
+            assert env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list)
             if cfg is None:
-                # 'It should be default env tuned config'
-                self.cfg = get_instance_config(env, algorithm="PPO")
-            else:
-                self.cfg = cfg
-        elif isinstance(env, BaseEnv):
-            self.cfg = cfg
-            raise NotImplementedError
+                cfg = get_instance_config(env_id, algorithm="PPOF")
+
+            if not hasattr(cfg, "env_id"):
+                cfg.env_id = env_id
+            assert cfg.env_id == env_id, "env_id in cfg should be the same as env_id in args."
+        else:
+            assert hasattr(cfg, "env_id"), "Please specify env_id in cfg."
+            assert cfg.env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(
+                PPOF.supported_env_list
+            )
+
+        if exp_name is not None:
+            cfg.exp_name = exp_name
+        elif not hasattr(cfg, "exp_name"):
+            cfg.exp_name = "{}-{}".format(cfg.env_id, "PPO")
+        self.cfg = cfg
+        self.exp_name = self.cfg.exp_name
+
+        if env is None:
+            self.env = get_instance_env(self.cfg.env_id)
         else:
-            raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env)))
+            self.env = env
+
         logging.getLogger().setLevel(logging.INFO)
         self.seed = seed
-        set_pkg_seed(self.seed)
-        self.exp_name = exp_name
+        set_pkg_seed(self.seed, use_cuda=self.cfg.cuda)
+
         if not os.path.exists(self.exp_name):
             os.makedirs(self.exp_name)
         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
@@ -152,19 +176,36 @@ def train(
 
         return TrainingReturn(wandb_url=task.ctx.wandb_url)
 
-    def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float:
+    def deploy(
+            self,
+            enable_save_replay: bool = False,
+            concatenate_all_replay: bool = False,
+            replay_save_path: str = None,
+            seed: Optional[Union[int, List]] = None,
+            debug: bool = False
+    ) -> EvalReturn:
         if debug:
             logging.getLogger().setLevel(logging.DEBUG)
         # define env and policy
         env = self.env.clone(caller='evaluator')
-        env.seed(self.seed, dynamic_seed=False)
 
-        if enable_save_replay and replay_save_path:
+        if seed is not None and isinstance(seed, int):
+            seeds = [seed]
+        elif seed is not None and isinstance(seed, list):
+            seeds = seed
+        else:
+            seeds = [self.seed]
+
+        returns = []
+        images = []
+        if enable_save_replay:
+            replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
             env.enable_save_replay(replay_path=replay_save_path)
-        elif enable_save_replay:
-            env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos'))
         else:
             logging.warning('No video would be generated during the deploy.')
+            if concatenate_all_replay:
+                logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
+                concatenate_all_replay = False
 
         forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.cuda)
 
@@ -172,22 +213,31 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None,
 
         # env will be reset again in the main loop
         env.reset()
 
-        # main loop
-        return_ = 0.
-        step = 0
-        obs = env.reset()
-        while True:
-            action = forward_fn(obs)
-            obs, rew, done, info = env.step(action)
-            return_ += rew
-            step += 1
-            if done:
-                break
-        logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}')
+        for seed in seeds:
+            env.seed(seed, dynamic_seed=False)
+            return_ = 0.
+            step = 0
+            obs = env.reset()
+            images.append(render(env)[None]) if concatenate_all_replay else None
+            while True:
+                action = forward_fn(obs)
+                obs, rew, done, info = env.step(action)
+                images.append(render(env)[None]) if concatenate_all_replay else None
+                return_ += rew
+                step += 1
+                if done:
+                    break
+            logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}')
+            returns.append(return_)
         env.close()
 
-        return return_
+        if concatenate_all_replay:
+            images = np.concatenate(images, axis=0)
+            import imageio
+            imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
+
+        return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
 
     def collect_data(
             self,
diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py
index eba569cfbc..9f62e2f429 100644
--- a/ding/framework/middleware/functional/logger.py
+++ b/ding/framework/middleware/functional/logger.py
@@ -303,16 +303,30 @@ def _plot(ctx: "OnlineRLContext"):
             )
 
         if ctx.eval_value != -np.inf:
-            info_for_logging.update(
-                {
+            if hasattr(ctx, "eval_value_min"):
+                info_for_logging.update({
                     "episode return min": ctx.eval_value_min,
+                })
+            if hasattr(ctx, "eval_value_max"):
+                info_for_logging.update({
                     "episode return max": ctx.eval_value_max,
-                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "eval_value_std"):
+                info_for_logging.update({
                     "episode return std": ctx.eval_value_std,
+                })
+            if hasattr(ctx, "eval_value"):
+                info_for_logging.update({
+                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "train_iter"):
+                info_for_logging.update({
                     "train iter": ctx.train_iter,
-                    "env step": ctx.env_step
-                }
-            )
+                })
+            if hasattr(ctx, "env_step"):
+                info_for_logging.update({
+                    "env step": ctx.env_step,
+                })
 
         eval_output = ctx.eval_output['output']
         episode_return = ctx.eval_output['episode_return']
@@ -597,16 +611,30 @@ def _plot(ctx: "OfflineRLContext"):
             )
 
         if ctx.eval_value != -np.inf:
-            info_for_logging.update(
-                {
+            if hasattr(ctx, "eval_value_min"):
+                info_for_logging.update({
                     "episode return min": ctx.eval_value_min,
+                })
+            if hasattr(ctx, "eval_value_max"):
+                info_for_logging.update({
                     "episode return max": ctx.eval_value_max,
-                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "eval_value_std"):
+                info_for_logging.update({
                     "episode return std": ctx.eval_value_std,
+                })
+            if hasattr(ctx, "eval_value"):
+                info_for_logging.update({
+                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "train_iter"):
+                info_for_logging.update({
                     "train iter": ctx.train_iter,
+                })
+            if hasattr(ctx, "train_epoch"):
+                info_for_logging.update({
                     "train_epoch": ctx.train_epoch,
-                }
-            )
+                })
 
         eval_output = ctx.eval_output['output']
         episode_return = ctx.eval_output['episode_return']
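For reference, below is a minimal usage sketch of the agent interface this patch settles on. It is not part of the patch: it only exercises the constructor and deploy() arguments visible in the diff above, the env id, experiment name, and seed values are illustrative, and it assumes DI-engine is installed together with the Box2D extras. A training call would normally precede deployment, but the train() signature is not shown in this diff, so it is omitted here.

    from ding.bonus import PPOF

    # Build the agent from a supported gym env id; when `cfg` is omitted, the tuned
    # default config is resolved via get_instance_config(env_id, algorithm="PPOF").
    agent = PPOF(env_id='LunarLander-v2', exp_name='LunarLander-v2-PPO', seed=0)

    # deploy() now evaluates one episode per seed and returns an EvalReturn holding
    # the mean and standard deviation of the episode returns.
    result = agent.deploy(enable_save_replay=True, seed=[0, 1, 2])
    print(result.eval_value, result.eval_value_std)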