
Commit

v0.7 (#26)
* update from scratch configs

* update gym pretraining configs - use fewer epochs

* update robomimic pretraining configs - use fewer epochs

* allow trajectory plotting in eval agent

* add simple vit unet

* update avoid pretraining configs - use fewer epochs

* update furniture pretraining configs - use the same number of epochs as before

* add robomimic diffusion unet pretraining configs

* update robomimic finetuning configs - higher lr

* add vit unet checkpoint urls

* update pretraining and finetuning instructions as configs are updated
allenzren committed Nov 20, 2024
1 parent d2929f6 commit 1d04211
Showing 158 changed files with 3,350 additions and 410 deletions.
5 changes: 5 additions & 0 deletions agent/eval/eval_agent.py
@@ -57,6 +57,7 @@ def __init__(self, cfg):
        self.horizon_steps = cfg.horizon_steps
        self.max_episode_steps = cfg.env.max_episode_steps
        self.reset_at_iteration = cfg.env.get("reset_at_iteration", True)
+        self.save_full_observations = cfg.env.get("save_full_observations", False)
        self.furniture_sparse_reward = (
            cfg.env.specific.get("sparse_reward", False)
            if "specific" in cfg.env
@@ -85,6 +86,10 @@ def __init__(self, cfg):
        assert not (
            self.n_render <= 0 and self.render_video
        ), "Need to set n_render > 0 if saving video"
+        self.traj_plotter = (
+            hydra.utils.instantiate(cfg.plotter)
+            if "plotter" in cfg else None
+        )

    def run(self):
        pass
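For context on the optional plotter lookup above: hydra.utils.instantiate imports the class named by `_target_` and calls it with the remaining keys as keyword arguments. A minimal, standalone sketch of that mechanism (not part of this commit; `DummyPlotter` is a stand-in for the repo's `env.plot_traj.TrajPlotter`):

# Standalone sketch of Hydra instantiation (assumes hydra-core and omegaconf are installed).
from dataclasses import dataclass

import hydra
from omegaconf import OmegaConf


@dataclass
class DummyPlotter:  # stand-in for env.plot_traj.TrajPlotter
    env_type: str
    normalization_path: str


cfg = OmegaConf.create(
    {
        "plotter": {
            "_target_": "__main__.DummyPlotter",
            "env_type": "avoid",
            "normalization_path": "normalization.npz",
        }
    }
)
# Mirrors the agent code: instantiate only if a `plotter` block is present.
traj_plotter = hydra.utils.instantiate(cfg.plotter) if "plotter" in cfg else None
print(traj_plotter)  # DummyPlotter(env_type='avoid', normalization_path='normalization.npz')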
22 changes: 22 additions & 0 deletions agent/eval/eval_diffusion_agent.py
@@ -37,6 +37,11 @@ def run(self):
        prev_obs_venv = self.reset_env_all(options_venv=options_venv)
        firsts_trajs[0] = 1
        reward_trajs = np.zeros((self.n_steps, self.n_envs))
+        if self.save_full_observations:  # state-only
+            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
+            obs_full_trajs = np.vstack(
+                (obs_full_trajs, prev_obs_venv["state"][:, -1][None])
+            )

        # Collect a set of trajectories from env
        for step in range(self.n_steps):
@@ -62,6 +67,13 @@ def run(self):
            )
            reward_trajs[step] = reward_venv
            firsts_trajs[step + 1] = terminated_venv | truncated_venv
+            if self.save_full_observations:  # state-only
+                obs_full_venv = np.array(
+                    [info["full_obs"]["state"] for info in info_venv]
+                )  # n_envs x act_steps x obs_dim
+                obs_full_trajs = np.vstack(
+                    (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
+                )

            # update for next step
            prev_obs_venv = obs_venv
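The two vstack calls above accumulate one long state trajectory per environment along the time axis. A standalone NumPy sketch of the shape bookkeeping, with made-up sizes (not part of this commit):

import numpy as np

n_envs, obs_dim, act_steps = 4, 11, 3

# The buffer starts empty along the time axis: (0, n_envs, obs_dim).
obs_full_trajs = np.empty((0, n_envs, obs_dim))

# The initial state contributes one time step: (1, n_envs, obs_dim).
obs_full_trajs = np.vstack((obs_full_trajs, np.zeros((1, n_envs, obs_dim))))

# Each env step returns act_steps states per env, shaped (n_envs, act_steps, obs_dim);
# transposing to (act_steps, n_envs, obs_dim) lets vstack append along the time axis.
obs_full_venv = np.zeros((n_envs, act_steps, obs_dim))
obs_full_trajs = np.vstack((obs_full_trajs, obs_full_venv.transpose(1, 0, 2)))

print(obs_full_trajs.shape)  # (4, 4, 11): 1 + act_steps time steps, n_envs, obs_dim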
@@ -108,6 +120,16 @@ def run(self):
            success_rate = 0
            log.info("[WARNING] No episode completed within the iteration!")

+        # Plot state trajectories (only in D3IL)
+        if self.traj_plotter is not None:
+            self.traj_plotter(
+                obs_full_trajs=obs_full_trajs,
+                n_render=self.n_render,
+                max_episode_steps=self.max_episode_steps,
+                render_dir=self.render_dir,
+                itr=0,
+            )
+
        # Log loss and save metrics
        time = timer()
        log.info(
68 changes: 68 additions & 0 deletions cfg/d3il/eval/avoid_m1/eval_diffusion_mlp.yaml
@@ -0,0 +1,68 @@
defaults:
  - _self_
hydra:
  run:
    dir: ${logdir}
_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent

name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path:
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz

seed: 42
device: cuda:0
env_name: avoiding-m5
obs_dim: 4
action_dim: 2
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4

n_steps: 25
render_num: 40

plotter:
  _target_: env.plot_traj.TrajPlotter
  env_type: avoid
  normalization_path: ${normalization_path}

env:
  n_envs: 40
  name: ${env_name}
  max_episode_steps: 100
  reset_at_iteration: True
  save_video: False
  best_reward_threshold_for_success: 2
  save_full_observations: True
  wrappers:
    d3il_lowdim:
      normalization_path: ${normalization_path}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      pass_full_observations: ${env.save_full_observations}
      reset_within_step: False

model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 16
    mlp_dims: [512, 512, 512]
    activation_type: ReLU
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
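Note that the `${eval:'...'}` interpolation used for `cond_dim` is not a built-in OmegaConf resolver; the entry script is expected to register one before composing the config. A minimal, standalone sketch of that pattern (the repo's actual registration may differ):

from omegaconf import OmegaConf

# Let ${eval:'...'} evaluate a Python expression after nested
# interpolations such as ${obs_dim} are resolved.
OmegaConf.register_new_resolver("eval", eval, replace=True)

cfg = OmegaConf.create(
    {
        "obs_dim": 4,
        "cond_steps": 1,
        "cond_dim": "${eval:'${obs_dim} * ${cond_steps}'}",
    }
)
print(cfg.cond_dim)  # 4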
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
@@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
6 changes: 3 additions & 3 deletions cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
-  batch_size: 32
+  n_epochs: 5000
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
@@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
6 changes: 3 additions & 3 deletions cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
-  batch_size: 32
+  n_epochs: 5000
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
@@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
4 changes: 2 additions & 2 deletions cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
@@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 32
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
2 changes: 2 additions & 0 deletions cfg/finetuning.md
@@ -1,5 +1,7 @@
## Fine-tuning experiments

+**Update, Nov 20 2024**: In v0.7 we updated the fine-tuning configs, as we find sample efficiency can be improved with a higher actor learning rate and other hyperparameter changes. If you would like to replicate the original experimental results from the paper, please use the configs from v0.6. Otherwise, we recommend starting with the v0.7 configs for your applications.

### Comparing diffusion-based RL algorithms (Sec. 5.1)
Gym configs are under `cfg/gym/finetune/<env_name>/`, and the naming follows `ft_<alg_name>_diffusion_mlp`, e.g., `ft_awr_diffusion_mlp`. `alg_name` is one of `rwr`, `awr`, `dipo`, `idql`, `dql`, `qsm`, `ppo` (DPPO), `ppo_exact` (exact likelihood). They share the same pre-trained checkpoint in each env.
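As a quick illustration of this naming scheme, a hypothetical helper (not part of the repo):

ALG_NAMES = ["rwr", "awr", "dipo", "idql", "dql", "qsm", "ppo", "ppo_exact"]


def finetune_config_name(alg_name: str) -> str:
    """Build the fine-tuning config name for a given algorithm."""
    assert alg_name in ALG_NAMES, f"unknown algorithm: {alg_name}"
    return f"ft_{alg_name}_diffusion_mlp"


print(finetune_config_name("awr"))  # ft_awr_diffusion_mlp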

66 changes: 66 additions & 0 deletions cfg/furniture/eval/lamp_low/eval_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
defaults:
  - _self_
hydra:
  run:
    dir: ${logdir}
_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent

name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path:
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth

seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
denoising_steps: 100
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
ddim_steps: 5

n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
render_num: 0

env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 1000
  best_reward_threshold_for_success: 2
  specific:
    headless: true
    furniture: lamp
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True

model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 32
    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
    cond_mlp_dims: [512, 64]
    use_layernorm: True # needed for larger MLP
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
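The `n_steps` interpolation above is a simple rounding of episode length by action-chunk size; a standalone check (not part of this commit):

max_episode_steps, act_steps = 1000, 8
n_steps = round(max_episode_steps / act_steps)
print(n_steps)  # 125 environment steps (action chunks) per evaluation rollout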
