diff --git a/examples/benchmarking/README.md b/examples/benchmarking/README.md index 9781448b5..72832d960 100644 --- a/examples/benchmarking/README.md +++ b/examples/benchmarking/README.md @@ -8,16 +8,17 @@ Code here is used to benchmark the performance of various simulators/benchmarks To benchmark ManiSkill + SAPIEN, after following the setup instructions on this repository's README.md, run ``` -python benchmark_gpu_sim.py -e "PickCube-v1" --num-envs=1024 --obs-mode=state # test just state simulation -python benchmark_gpu_sim.py -e "PickCube-v1" --num-envs=128 --obs-mode=rgbd # test state sim + parallel rendering one 128x128 RGBD cameras per environment -python benchmark_gpu_sim.py -e "PickCube-v1" --num-envs=128 --save-video # save a video showing all 128 visual observations +python benchmark_maniskill.py -e "PickCube-v1" --num-envs=1024 --obs-mode=state # test just state simulation +python benchmark_maniskill.py -e "PickCube-v1" --num-envs=128 --obs-mode=rgbd # test state sim + parallel rendering one 128x128 RGBD cameras per environment +python benchmark_maniskill.py -e "PickCube-v1" --num-envs=128 --save-video # save a video showing all 128 visual observations ``` -To get the best reported results, we run two commands on a machine with a RTX 4090: +To get the reported results, we run two commands on a machine with a RTX 4090: ``` -python benchmark_gpu_sim.py -e "PickCube-v1" --num-envs=4096 --obs-mode=state -python benchmark_gpu_sim.py -e "PickCube-v1" --num-envs=1536 --obs-mode=rgbd +python benchmark_maniskill.py -e "PickCube-v1" --num-envs=4096 --obs-mode=state --control-freq=50 +python benchmark_maniskill.py -e "PickCube-v1" --num-envs=1536 --obs-mode=rgbd --control-freq=50 +# note we use --control-freq=50 as this is the control frequency isaac sim based repos tend to use ``` These are the expected state-based only results: diff --git a/examples/benchmarking/benchmark_cpu_sim.py b/examples/benchmarking/benchmark_cpu_sim.py deleted file mode 100644 index b87b0f05f..000000000 --- a/examples/benchmarking/benchmark_cpu_sim.py +++ /dev/null @@ -1,44 +0,0 @@ -import time - -import gymnasium as gym -import numpy as np -import sapien -import sapien.physx -import sapien.render -import tqdm - -import mani_skill2.envs - -if __name__ == "__main__": - num_envs = 12 - env_id = "PickCube-v0" - env = gym.make_vec( - env_id, - num_envs, - vectorization_mode="async", - vector_kwargs=dict(context="forkserver"), - ) - np.random.seed(2022) - env.reset(seed=2022) - print("GPU Simulation Enabled:", sapien.physx.is_gpu_enabled()) - N = 1000 - stime = time.time() - for i in tqdm.tqdm(range(N)): - env.step(env.action_space.sample()) - dtime = time.time() - stime - FPS = num_envs * N / dtime - print(f"{FPS=:0.3f}. {N=} frames in {dtime:0.3f}s with {num_envs} parallel envs") - N = 1000 - stime = time.time() - for i in tqdm.tqdm(range(N)): - actions = env.action_space.sample() - obs, rew, terminated, truncated, info = env.step(actions) - if i % 200 == 0 and i != 0: - env.reset() - print("RESET") - dtime = time.time() - stime - FPS = num_envs * N / dtime - print( - f"{FPS=:0.3f}. 
{N=} frames in {dtime:0.3f}s with {num_envs} parallel envs with step+reset" - ) - env.close() diff --git a/examples/benchmarking/benchmark_gpu_sim.py b/examples/benchmarking/benchmark_maniskill.py similarity index 74% rename from examples/benchmarking/benchmark_gpu_sim.py rename to examples/benchmarking/benchmark_maniskill.py index 7a379c9d7..6a1e3060a 100644 --- a/examples/benchmarking/benchmark_gpu_sim.py +++ b/examples/benchmarking/benchmark_maniskill.py @@ -12,6 +12,10 @@ import tqdm import mani_skill2.envs +from mani_skill2.envs.scenes.tasks.planner.planner import PickSubtask +from mani_skill2.envs.scenes.tasks.sequential_task import SequentialTaskEnv +from mani_skill2.utils.scene_builder.ai2thor.variants import ArchitecTHORSceneBuilder +from mani_skill2.utils.scene_builder.replicacad.scene_builder import ReplicaCADSceneBuilder from mani_skill2.vector.wrappers.gymnasium import ManiSkillVectorEnv from profiling import Profiler from mani_skill2.utils.visualization.misc import images_to_video, tile_images @@ -21,20 +25,30 @@ def main(args): profiler = Profiler(output_format=args.format) num_envs = args.num_envs - env = gym.make( - args.env_id, - num_envs=num_envs, - obs_mode=args.obs_mode, - # enable_shadow=True, - render_mode=args.render_mode, - control_mode=args.control_mode, - sim_cfg=dict(control_freq=50) - ) - if isinstance(env.action_space, gym.spaces.Dict): - env = FlattenActionSpaceWrapper(env) - env = ManiSkillVectorEnv(env) + sim_cfg = dict() + if args.control_freq: + sim_cfg["control_freq"] = args.control_freq + if args.sim_freq: + sim_cfg["sim_freq"] = args.sim_freq + if not args.cpu_sim: + env = gym.make( + args.env_id, + num_envs=num_envs, + obs_mode=args.obs_mode, + # enable_shadow=True, + render_mode=args.render_mode, + control_mode=args.control_mode, + sim_cfg=sim_cfg + ) + if isinstance(env.action_space, gym.spaces.Dict): + env = FlattenActionSpaceWrapper(env) + env = ManiSkillVectorEnv(env) + base_env = env.base_env + else: + env = gym.make_vec(args.env_id, num_envs=args.num_envs, vectorization_mode="async", vector_kwargs=dict(context="spawn"), obs_mode=args.obs_mode,) + base_env = gym.make(args.env_id, obs_mode=args.obs_mode).unwrapped sensor_settings_str = [] - for uid, cam in env.base_env._sensors.items(): + for uid, cam in base_env._sensors.items(): cfg = cam.cfg sensor_settings_str.append(f"{cfg.width}x{cfg.height}") sensor_settings_str = "_".join(sensor_settings_str) @@ -51,10 +65,10 @@ def main(args): f"render_mode={args.render_mode}, sensor_details={sensor_settings_str}, save_video={args.save_video}" ) print( - f"sim_freq={env.base_env.sim_freq}, control_freq={env.base_env.control_freq}" + f"sim_freq={base_env.sim_freq}, control_freq={base_env.control_freq}" ) print(f"observation space: {env.observation_space}") - print(f"action space: {env.base_env.single_action_space}") + print(f"action space: {base_env.single_action_space}") print( "# -------------------------------------------------------------------------- #" ) @@ -70,7 +84,7 @@ def main(args): with profiler.profile("env.step", total_steps=N, num_envs=num_envs): for i in range(N): actions = ( - 2 * torch.rand(env.action_space.shape, device=env.base_env.device) + 2 * torch.rand(env.action_space.shape, device=base_env.device) - 1 ) obs, rew, terminated, truncated, info = env.step(actions) @@ -126,6 +140,9 @@ def parse_args(): parser.add_argument("-o", "--obs-mode", type=str, default="state") parser.add_argument("-c", "--control-mode", type=str, default="pd_joint_delta_pos") parser.add_argument("-n", 
"--num-envs", type=int, default=1024) + parser.add_argument("--cpu-sim", action="store_true", help="Whether to use the CPU or GPU simulation") + parser.add_argument("--control-freq", type=int, default=None, help="The control frequency to use") + parser.add_argument("--sim-freq", type=int, default=None, help="The simulation frequency to use") parser.add_argument( "--render-mode", type=str, diff --git a/mani_skill2/__init__.py b/mani_skill2/__init__.py index da500af2e..49b8a1fe6 100644 --- a/mani_skill2/__init__.py +++ b/mani_skill2/__init__.py @@ -9,7 +9,12 @@ PACKAGE_DIR = Path(__file__).parent.resolve() PACKAGE_ASSET_DIR = PACKAGE_DIR / "assets" # Non-package data -ASSET_DIR = Path(os.getenv("MS2_ASSET_DIR", "data")) +ASSET_DIR = Path( + os.getenv("MS_ASSET_DIR", os.path.join(os.path.expanduser("~"), ".maniskill/data")) +) +DEMO_DIR = Path( + os.getenv("MS_ASSET_DIR", os.path.join(os.path.expanduser("~"), ".maniskill/demos")) +) def format_path(p: str): diff --git a/mani_skill2/agents/robots/fetch/fetch.py b/mani_skill2/agents/robots/fetch/fetch.py index c95753e74..d2d6dcdf8 100644 --- a/mani_skill2/agents/robots/fetch/fetch.py +++ b/mani_skill2/agents/robots/fetch/fetch.py @@ -51,13 +51,25 @@ class Fetch(BaseAgent): q=[1, 0, 0, 0], width=128, height=128, - fov=1.57, + fov=2, near=0.01, far=10, entity_uid="head_camera_link", + ), + CameraConfig( + uid="fetch_hand", + p=[-0.1, 0, 0.1], + q=[1, 0, 0, 0], + width=128, + height=128, + fov=2, + near=0.01, + far=10, + entity_uid="gripper_link", ) ] REACHABLE_DIST = 1.5 + RESTING_QPOS = np.array([0, 0, 0, 0.386, 0, -0.370, 0.562, -1.032, 0.695, 0.955, -0.1, 2.077, 0, 0.015, 0.015]) def __init__(self, *args, **kwargs): self.arm_joint_names = [ @@ -204,14 +216,14 @@ def controller_configs(self): # -------------------------------------------------------------------------- # # Body # -------------------------------------------------------------------------- # - body_pd_joint_pos = PDJointPosControllerConfig( + body_pd_joint_delta_pos = PDJointPosControllerConfig( self.body_joint_names, - [-1.57, -1.6056, 0], - [1.57, 1.6056, 0.38615], + -0.1, + 0.1, self.body_stiffness, self.body_damping, self.body_force_limit, - normalize_action=True, + use_delta=True, ) # -------------------------------------------------------------------------- # @@ -229,69 +241,69 @@ def controller_configs(self): pd_joint_delta_pos=dict( arm=arm_pd_joint_delta_pos, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_joint_pos=dict( arm=arm_pd_joint_pos, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_ee_delta_pos=dict( arm=arm_pd_ee_delta_pos, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_ee_delta_pose=dict( arm=arm_pd_ee_delta_pose, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_ee_delta_pose_align=dict( arm=arm_pd_ee_delta_pose_align, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), # TODO(jigu): how to add boundaries for the following controllers pd_joint_target_delta_pos=dict( arm=arm_pd_joint_target_delta_pos, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_ee_target_delta_pos=dict( arm=arm_pd_ee_target_delta_pos, gripper=gripper_pd_joint_pos, - 
body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_ee_target_delta_pose=dict( arm=arm_pd_ee_target_delta_pose, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), # Caution to use the following controllers pd_joint_vel=dict( arm=arm_pd_joint_vel, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_joint_pos_vel=dict( arm=arm_pd_joint_pos_vel, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), pd_joint_delta_pos_vel=dict( arm=arm_pd_joint_delta_pos_vel, gripper=gripper_pd_joint_pos, - body=body_pd_joint_pos, + body=body_pd_joint_delta_pos, base=base_pd_joint_vel, ), ) @@ -315,7 +327,7 @@ def _after_init(self): self.r_wheel_link: Link = get_obj_by_name( self.robot.get_links(), "r_wheel_link" ) - for link in [self.base_link, self.l_wheel_link, self.r_wheel_link]: + for link in [self.l_wheel_link, self.r_wheel_link]: for body in link._bodies: cs = body.get_collision_shapes()[0] cg = cs.get_collision_groups() @@ -326,8 +338,8 @@ def _after_init(self): self.robot.get_links(), "torso_lift_link" ) - self.torso_lift_link: Link = get_obj_by_name( - self.robot.get_links(), "torso_lift_link" + self.head_camera_link: Link = get_obj_by_name( + self.robot.get_links(), "head_camera_link" ) self.queries: Dict[ @@ -419,7 +431,7 @@ def is_grasping(self, object: Actor = None, min_impulse=1e-6, max_angle=85): and np.rad2deg(rangle) <= max_angle ) - return all([lflag, rflag]) + return torch.tensor([all([lflag, rflag])], dtype=bool) def is_static(self, threshold: float = 0.2): qvel = self.robot.get_qvel()[..., :-2] diff --git a/mani_skill2/assets/robots/fetch/fetch.urdf b/mani_skill2/assets/robots/fetch/fetch.urdf index a629eb952..dddf70f78 100644 --- a/mani_skill2/assets/robots/fetch/fetch.urdf +++ b/mani_skill2/assets/robots/fetch/fetch.urdf @@ -22,14 +22,14 @@ - + - + diff --git a/mani_skill2/envs/__init__.py b/mani_skill2/envs/__init__.py index 6e73415d0..e9c81a4a5 100644 --- a/mani_skill2/envs/__init__.py +++ b/mani_skill2/envs/__init__.py @@ -1,4 +1,4 @@ # from .ms1 import * from .ms2 import * -from .scenes.pick_object import * +from .scenes import * from .tasks import * diff --git a/mani_skill2/envs/sapien_env.py b/mani_skill2/envs/sapien_env.py index 04895748c..f1912793d 100644 --- a/mani_skill2/envs/sapien_env.py +++ b/mani_skill2/envs/sapien_env.py @@ -965,6 +965,10 @@ def render_human(self): if self._viewer is None: self._viewer = Viewer() self._setup_viewer() + if "render_camera" in self._human_render_cameras: + self._viewer.set_camera_pose( + self._human_render_cameras["render_camera"].camera.global_pose + ) for obj in self._hidden_objects: obj.show_visual() diff --git a/mani_skill2/envs/scenes/__init__.py b/mani_skill2/envs/scenes/__init__.py new file mode 100644 index 000000000..1c69ac41f --- /dev/null +++ b/mani_skill2/envs/scenes/__init__.py @@ -0,0 +1,28 @@ +from mani_skill2.utils.registration import register_env +from mani_skill2.utils.scene_builder.ai2thor.variants import ( + ArchitecTHORSceneBuilder, + ProcTHORSceneBuilder, + RoboTHORSceneBuilder, + iTHORSceneBuilder, +) +from mani_skill2.utils.scene_builder.replicacad.scene_builder import ( + ReplicaCADSceneBuilder, +) + +from .base_env import SceneManipulationEnv + +scene_builders = { + "ReplicaCAD": ReplicaCADSceneBuilder, + "ArchitecTHOR": ArchitecTHORSceneBuilder, + "ProcTHOR": ProcTHORSceneBuilder, + "RoboTHOR": 
RoboTHORSceneBuilder, + "iTHOR": iTHORSceneBuilder, +} + +# Register environments just for benchmarking/exploration and to be creatable by just ID, these don't have any specific tasks designed in them. +for k, scene_builder in scene_builders.items(): + register_env( + f"{k}_SceneManipulation-v1", + max_episode_steps=None, + scene_builder_cls=scene_builder, + )(SceneManipulationEnv) diff --git a/mani_skill2/envs/scenes/base_env.py b/mani_skill2/envs/scenes/base_env.py index 96311f264..aeea02c40 100644 --- a/mani_skill2/envs/scenes/base_env.py +++ b/mani_skill2/envs/scenes/base_env.py @@ -1,25 +1,29 @@ +from typing import Any, Dict, Union + import numpy as np import sapien as sapien import sapien.physx as physx +import torch from sapien import Pose -from mani_skill2.agents.robots import Panda +from mani_skill2.agents.robots import Fetch, Panda from mani_skill2.envs.sapien_env import BaseEnv from mani_skill2.sensors.camera import CameraConfig +from mani_skill2.utils.registration import register_env from mani_skill2.utils.sapien_utils import look_at from mani_skill2.utils.scene_builder import SceneBuilder -from mani_skill2.utils.scene_builder.ai2thor import ( - ArchitecTHORSceneBuilder, - ProcTHORSceneBuilder, - RoboTHORSceneBuilder, - iTHORSceneBuilder, +from mani_skill2.utils.scene_builder.replicacad.scene_builder import ( + ReplicaCADSceneBuilder, ) -from mani_skill2.utils.structs.pose import vectorize_pose +from mani_skill2.utils.structs.types import GPUMemoryConfig, SimConfig +@register_env("SceneManipulation-v1", max_episode_steps=200) class SceneManipulationEnv(BaseEnv): - agent: Panda """ + A base environment for simulating manipulation tasks in more complex scenes. Creating this environment is only useful for explorations/visualization, there are no success/failure + metrics or rewards + Args: robot_uids: Which robot to place into the scene. Default is "panda" @@ -32,85 +36,121 @@ class SceneManipulationEnv(BaseEnv): convex_decomposition: Choice of convex decomposition algorithm to generate collision meshes for objects. 
Default is `coacd` which uses https://github.com/SarahWeiii/CoACD """ + SUPPORTED_ROBOTS = ["panda", "fetch"] + sim_cfg = SimConfig( + spacing=50, + gpu_memory_cfg=GPUMemoryConfig( + found_lost_pairs_capacity=2**25, + max_rigid_patch_count=2**19, + max_rigid_contact_count=2**21, + ), + ) + agent: Union[Panda, Fetch] + def __init__( self, *args, - robot_uids="panda", + robot_uids="fetch", robot_init_qpos_noise=0.02, fixed_scene=True, - scene_builder_cls: SceneBuilder = ArchitecTHORSceneBuilder, + scene_builder_cls: SceneBuilder = ReplicaCADSceneBuilder, convex_decomposition="coacd", + scene_idxs=None, **kwargs ): self.robot_init_qpos_noise = robot_init_qpos_noise self.fixed_scene = fixed_scene self.sampled_scene_idx: int = None - self.scene_builder = scene_builder_cls() - self.scene_ids = np.arange(0, len(self.scene_builder.scene_configs)) + self.scene_builder: SceneBuilder = scene_builder_cls( + self, robot_init_qpos_noise=robot_init_qpos_noise + ) + if isinstance(scene_idxs, int): + self.scene_idxs = [scene_idxs] + elif isinstance(scene_idxs, list): + self.scene_idxs = scene_idxs + else: + self.scene_idxs = np.arange(0, len(self.scene_builder.scene_configs)) self.convex_decomposition = convex_decomposition super().__init__(*args, robot_uids=robot_uids, **kwargs) def reset(self, seed=None, options=None): self._set_episode_rng(seed) if options is None: - options = dict(reconfigure=True) + options = dict(reconfigure=False) if not self.fixed_scene: options["reconfigure"] = True - if options["reconfigure"]: - self.sampled_scene_idx = self._episode_rng.randint(0, len(self.scene_ids)) + if "reconfigure" in options and options["reconfigure"]: + self.sampled_scene_idx = self.scene_idxs[ + self._episode_rng.randint(0, len(self.scene_idxs)) + ] + self.sampled_scene_idx = int(self.sampled_scene_idx) return super().reset(seed, options) + def _setup_lighting(self): + if self.scene_builder.builds_lighting: + return + return super()._setup_lighting() + def _load_actors(self): self.scene_builder.build( self._scene, - scene_id=self.sampled_scene_idx, + scene_idx=self.sampled_scene_idx, convex_decomposition=self.convex_decomposition, ) - def _initialize_agent(self): - if self.robot_uids == "panda": - # fmt: off - # EE at [0.615, 0, 0.17] - qpos = np.array( - [0.0, np.pi / 8, 0, -np.pi * 5 / 8, 0, np.pi * 3 / 4, np.pi / 4, 0.04, 0.04] - ) - # fmt: on - qpos[:-2] += self._episode_rng.normal( - 0, self.robot_init_qpos_noise, len(qpos) - 2 - ) - self.agent.reset(qpos) - self.agent.robot.set_pose(Pose([-0.615, 0, 0])) - elif self.robot_uids == "xmate3_robotiq": - qpos = np.array( - [0, np.pi / 6, 0, np.pi / 3, 0, np.pi / 2, -np.pi / 2, 0, 0] - ) - qpos[:-2] += self._episode_rng.normal( - 0, self.robot_init_qpos_noise, len(qpos) - 2 - ) - self.agent.reset(qpos) - self.agent.robot.set_pose(Pose([-0.562, 0, 0])) - else: - raise NotImplementedError(self.robot_uids) + def _initialize_actors(self, env_idx: torch.Tensor): + with torch.device(self.device): + self.scene_builder.initialize(env_idx) + + def evaluate(self) -> dict: + return dict() + + def compute_dense_reward(self, obs: Any, action: torch.Tensor, info: Dict): + return 0 + + def compute_normalized_dense_reward( + self, obs: Any, action: torch.Tensor, info: Dict + ): + return self.compute_dense_reward(obs=obs, action=action, info=info) / 1 def _register_sensors(self): + if self.robot_uids == "fetch": + return () + pose = look_at([0.3, 0, 0.6], [-0.1, 0, 0.1]) return CameraConfig( "base_camera", pose.p, pose.q, 128, 128, np.pi / 2, 0.01, 10 ) def 
_register_human_render_cameras(self): + if self.robot_uids == "fetch": + room_camera_pose = look_at([2.5, -2.5, 3], [0.0, 0.0, 0]) + room_camera_config = CameraConfig( + "render_camera", + room_camera_pose.p, + room_camera_pose.q, + 512, + 512, + 1, + 0.01, + 10, + ) + robot_camera_pose = look_at([2, 0, 1], [0, 0, -1]) + robot_camera_config = CameraConfig( + "robot_render_camera", + robot_camera_pose.p, + robot_camera_pose.q, + 512, + 512, + 1.5, + 0.01, + 10, + link=self.agent.torso_lift_link, + ) + return [room_camera_config, robot_camera_config] + if self.robot_uids == "panda": pose = look_at([0.4, 0.4, 0.8], [0.0, 0.0, 0.4]) else: - pose = look_at([0.5, 0.5, 1.0], [0.0, 0.0, 0.5]) + pose = look_at([0, 10, -3], [0, 0, 0]) return CameraConfig("render_camera", pose.p, pose.q, 512, 512, 1, 0.01, 10) - - def _setup_viewer(self): - super()._setup_viewer() - self._viewer.set_camera_xyz(0.8, 0, 1.0) - self._viewer.set_camera_rpy(0, -0.5, 3.14) - - def _get_obs_agent(self): - obs = self.agent.get_proprioception() - obs["base_pose"] = vectorize_pose(self.agent.robot.pose) - return obs diff --git a/mani_skill2/envs/scenes/pick_object.py b/mani_skill2/envs/scenes/pick_object.py deleted file mode 100644 index 9b2f0c4ea..000000000 --- a/mani_skill2/envs/scenes/pick_object.py +++ /dev/null @@ -1,59 +0,0 @@ -from collections import OrderedDict - -import numpy as np -import sapien -import sapien.physx as physx - -from mani_skill2.utils.registration import register_env -from mani_skill2.utils.structs.pose import vectorize_pose - -from .base_env import SceneManipulationEnv - - -@register_env("PickObjectScene-v0", max_episode_steps=200) -class PickObjectSceneEnv(SceneManipulationEnv): - """ - Args: - - """ - - def __init__(self, *args, **kwargs): - self.box_length = 0.1 - super().__init__(*args, **kwargs) - - def _load_actors(self): - super()._load_actors() - - def reconfigure(self): - super().reconfigure() - self.init_state = self.get_state() - - def _get_obs_extra(self) -> OrderedDict: - obs = OrderedDict( - tcp_pose=vectorize_pose(self.agent.tcp.pose), - obj_pose=vectorize_pose(self.goal_obj.pose), - ) - return obs - - def evaluate(self, **kwargs): - return dict(success=self.agent.is_grasping(self.goal_obj)) - - def compute_dense_reward(self, info, **kwargs): - return 1 - np.tanh(np.linalg.norm(self.goal_obj.pose.p - self.agent.tcp.pose.p)) - - def compute_normalized_dense_reward(self, **kwargs): - return self.compute_dense_reward(**kwargs) / 1 - - def _initialize_actors(self): - self.set_state(self.init_state) - - def _initialize_task(self): - # pick a random goal object to pick up - self.goal_obj: sapien.Entity = self._episode_rng.choice( - self.scene_builder.movable_objects - ) - print(f"Target Object: {self.goal_obj.name}") - - def reset(self, seed=None, options=None): - # sample a scene - return super().reset(seed, options) diff --git a/mani_skill2/examples/demo_random_action.py b/mani_skill2/examples/demo_random_action.py index 02750dd83..177f2d4e2 100644 --- a/mani_skill2/examples/demo_random_action.py +++ b/mani_skill2/examples/demo_random_action.py @@ -78,8 +78,9 @@ def main(args): if args.render_mode is not None: env.render() - if terminated or truncated: - break + if args.render_mode is None or args.render_mode != "human": + if terminated or truncated: + break env.close() diff --git a/mani_skill2/utils/building/actors.py b/mani_skill2/utils/building/actors.py index 04bb8337d..528401d1f 100644 --- a/mani_skill2/utils/building/actors.py +++ b/mani_skill2/utils/building/actors.py @@ -262,7 
+262,7 @@ def build_actor_ycb( model_id: str, scene: ManiSkillScene, name: str, - root_dir=ASSET_DIR / "mani_skill2_ycb", + root_dir=ASSET_DIR / "assets/mani_skill2_ycb", body_type: str = "dynamic", add_collision: bool = True, return_builder: bool = False, @@ -299,7 +299,7 @@ def build_actor_ycb( def _load_ycb_dataset(): # load YCB if used MODEL_DBS["YCB"] = { - "model_data": load_json(ASSET_DIR / "mani_skill2_ycb/info_pick_v0.json"), + "model_data": load_json(ASSET_DIR / "assets/mani_skill2_ycb/info_pick_v0.json"), "builder": build_actor_ycb, } diff --git a/mani_skill2/utils/building/articulation_builder.py b/mani_skill2/utils/building/articulation_builder.py index ed717cb3f..55c7987b8 100644 --- a/mani_skill2/utils/building/articulation_builder.py +++ b/mani_skill2/utils/building/articulation_builder.py @@ -49,7 +49,6 @@ def create_link_builder(self, parent: LinkBuilder = None): builder = LinkBuilder(len(self.link_builders), parent) self.link_builders.append(builder) if self.disable_self_collisions: - # NOTE (stao): Currently in SAPIEN you can't set collision groups after building the links due to some bug when doing GPU sim. builder.collision_groups[2] |= 1 << 29 return builder diff --git a/mani_skill2/utils/building/urdf_loader.py b/mani_skill2/utils/building/urdf_loader.py index cc1112888..d8732ebf2 100644 --- a/mani_skill2/utils/building/urdf_loader.py +++ b/mani_skill2/utils/building/urdf_loader.py @@ -16,7 +16,7 @@ class URDFLoader(SapienURDFLoader): scene: ManiSkillScene - name: str + name: str = None disable_self_collisions: bool = False def parse( diff --git a/mani_skill2/utils/download_asset.py b/mani_skill2/utils/download_asset.py index b7b5300fc..215c3b460 100644 --- a/mani_skill2/utils/download_asset.py +++ b/mani_skill2/utils/download_asset.py @@ -11,57 +11,86 @@ import shutil import urllib.request import zipfile +from dataclasses import dataclass from pathlib import Path +from typing import Dict from urllib.error import URLError +from huggingface_hub import login, snapshot_download from tqdm.auto import tqdm from mani_skill2 import ASSET_DIR, PACKAGE_ASSET_DIR from mani_skill2.utils.io_utils import load_json -DATA_SOURCES = {} + +@dataclass +class DataSource: + url: str = None + hf_repo_id: str = None + target_path: str = None + checksum: str = None + output_dir: str = ASSET_DIR + + +DATA_SOURCES: Dict[str, DataSource] = {} DATA_GROUPS = {} def initialize_sources(): - DATA_SOURCES["ycb"] = dict( + """ + Initialize the metadata for assets + + Note that the current organization works as follows + + - assets/* contain files for individual objects (.obj, .glb etc.) and articulations (.urdf etc.) that are generally reused. + E.g. Partnet Mobility and the YCB dataset. + - tasks/* contain files that are otherwise too big to upload to GitHub and are relevant for just that one task and will generally not be reused + - scene_datasets/* is a bit messier but contains a self-contained folder for each scene dataset each of which is likely organized differently. + These datasets often put their unique object and articulation files together with scene configuration data. In the future we will re-organize these + datasets so that all objects are put into assets and leave scene_datasets for scene configuration information. + - robots/* contains files for additional robots that are not included by default. 
+ + """ + + # TODO add google scanned objects + DATA_SOURCES["ycb"] = DataSource( url="https://huggingface.co/datasets/haosulab/ManiSkill2/resolve/main/data/mani_skill2_ycb.zip", - target_path="mani_skill2_ycb", + target_path="assets/mani_skill2_ycb", checksum="174001ba1003cc0c5adda6453f4433f55ec7e804f0f0da22d015d525d02262fb", ) DATA_GROUPS["PickSingleYCB-v0"] = ["ycb"] - DATA_SOURCES["pick_clutter_ycb"] = dict( + DATA_SOURCES["pick_clutter_ycb"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/pick_clutter/ycb_train_5k.json.gz", - output_dir=ASSET_DIR / "pick_clutter", + target_path="tasks/pick_clutter", checksum="70ec176c7036f326ea7813b77f8c03bea9db5960198498957a49b2895a9ec338", ) DATA_GROUPS["PickClutterYCB-v0"] = ["ycb", "pick_clutter_ycb"] - DATA_SOURCES["assembling_kits"] = dict( + DATA_SOURCES["assembling_kits"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/assembling_kits_v1.zip", - target_path="assembling_kits", + target_path="tasks/assembling_kits", checksum="e3371f17a07a012edaa3a0b3604fb1577f3fb921876c3d5ed59733dd75a6b4a0", ) DATA_GROUPS["AssemblingKits-v0"] = ["assembling_kits"] - DATA_SOURCES["panda_avoid_obstacles"] = dict( + DATA_SOURCES["panda_avoid_obstacles"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/avoid_obstacles/panda_train_2k.json.gz", - output_dir=ASSET_DIR / "avoid_obstacles", + target_path="tasks/avoid_obstacles", checksum="44dae9a0804172515c290c1f49a1e7e72d76e40201a2c5c7d4a3ccd43b4d5be4", ) DATA_GROUPS["PandaAvoidObstacles-v0"] = ["panda_avoid_obstacles"] - DATA_SOURCES["pinch"] = dict( + DATA_SOURCES["pinch"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/pinch.zip", - target_path="pinch", + target_path="tasks/pinch", checksum="3281d2d777fad42e6d37371b2d3ee16fb1c39984907176718ca2e4f447326fe7", ) DATA_GROUPS["Pinch-v0"] = ["pinch"] - DATA_SOURCES["write"] = dict( + DATA_SOURCES["write"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/write.zip", - target_path="write", + target_path="tasks/write", checksum="c5b49e581bfed9cfb2107a607faf52795f840e93f5a7ad389290314513b4b634", ) DATA_GROUPS["Write-v0"] = ["write"] @@ -78,7 +107,7 @@ def initialize_sources(): category_uids[category] = [] for model_id in model_ids: uid = f"partnet_mobility/{model_id}" - DATA_SOURCES[uid] = dict( + DATA_SOURCES[uid] = DataSource( url=f"https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/partnet_mobility/dataset/{model_id}.zip", output_dir=ASSET_DIR / "partnet_mobility" / "dataset", ) @@ -97,9 +126,17 @@ def initialize_sources(): DATA_GROUPS["MoveBucket-v1"] = category_uids["bucket"] DATA_GROUPS["TurnFaucet-v0"] = category_uids["faucet"] + # ---------------------------------------------------------------------------- # + # Interactable Scene Datasets + # ---------------------------------------------------------------------------- # + DATA_SOURCES["ReplicaCAD"] = DataSource( + hf_repo_id="haosulab/ReplicaCAD", + target_path="scene_datasets/replica_cad_dataset", + ) + def initialize_extra_sources(): - DATA_SOURCES["xmate3_robotiq"] = dict( + DATA_SOURCES["xmate3_robotiq"] = DataSource( url="https://storage1.ucsd.edu/datasets/ManiSkill2022-assets/xmate3_robotiq.zip", target_path="robots/xmate3_robotiq", checksum="ddda102a20eb41e28a0a501702e240e5d7f4084221a44f580e729f08b7c12d1a", @@ -108,17 +145,12 @@ def initialize_extra_sources(): # ---------------------------------------------------------------------------- # # Visual backgrounds # 
---------------------------------------------------------------------------- # - DATA_SOURCES["minimalistic_modern_bedroom"] = dict( - url="https://storage1.ucsd.edu/datasets/ManiSkill-background/minimalistic_modern_bedroom.glb", - output_dir=ASSET_DIR / "background", - checksum="9d9ea14c8cdfab1ebafe3e8ff5a071b77b361c1abd24e9c59b0f08e1d1a3421a", - ) - # Alias - DATA_GROUPS["minimal_bedroom"] = ["minimalistic_modern_bedroom"] # All backgrounds DATA_GROUPS["backgrounds"] = ["minimalistic_modern_bedroom"] + # TODO add Replica, MatterPort 3D? + def prompt_yes_no(message): r"""Prints a message and prompts the user for "y" or "n" returning True or False.""" @@ -148,15 +180,26 @@ def sha256sum(filename, chunk_size=4096): return sha256_hash.hexdigest() +def download_from_hf_datasets( + data_source: DataSource, +): + output_dir = Path(data_source.output_dir) + output_path = output_dir / data_source.target_path + snapshot_download( + repo_id=data_source.hf_repo_id, + repo_type="dataset", + local_dir=output_path, + local_dir_use_symlinks=False, + resume_download=True, + ) + + def download( - url, - output_dir=ASSET_DIR, - target_path: str = None, - checksum=None, + data_source: DataSource, verbose=True, non_interactive=True, ): - output_dir = Path(output_dir) + output_dir = Path(data_source.output_dir) # Create output directory if not output_dir.exists(): @@ -164,9 +207,9 @@ def download( output_dir.mkdir(parents=True) else: return - + target_path = data_source.target_path if target_path is None: - target_path = url.split("/")[-1] + target_path = data_source.url.split("/")[-1] output_path = output_dir / target_path output_dir = osp.dirname(output_path) @@ -184,10 +227,14 @@ def download( print(f"Skip existing: {output_path}") return output_path + if data_source.hf_repo_id is not None: + download_from_hf_datasets(data_source) + return + # Download files to temporary location try: if verbose: - print(f"Downloading {url}") + print(f"Downloading {data_source.url}") pbar = tqdm(unit="iB", unit_scale=True, unit_divisor=1024) def show_progress(blocknum, bs, size): @@ -196,19 +243,22 @@ def show_progress(blocknum, bs, size): pbar.total = size pbar.update(bs) - filename, _ = urllib.request.urlretrieve(url, reporthook=show_progress) + filename, _ = urllib.request.urlretrieve( + data_source.url, reporthook=show_progress + ) if verbose: pbar.close() except URLError as err: - print(f"Failed to download {url}") + print(f"Failed to download {data_source.url}") raise err # Verify checksum - if checksum is not None and checksum != sha256sum(filename): - raise IOError(f"Downloaded file's SHA-256 hash does not match record: {url}") - + if data_source.checksum is not None and data_source.checksum != sha256sum(filename): + raise IOError( + f"Downloaded file's SHA-256 hash does not match record: {data_source.url}" + ) # Extract or move to output path - if url.endswith(".zip"): + if data_source.url.endswith(".zip"): with zipfile.ZipFile(filename, "r") as zip_ref: if verbose: for file in tqdm(zip_ref.infolist()): @@ -271,12 +321,12 @@ def main(args): if show_progress and verbose: print("Downloading assets for {}: {}/{}".format(args.uid, i + 1, len(uids))) - kwargs = DATA_SOURCES[uid].copy() + kwargs = dict() kwargs["verbose"] = verbose kwargs["non_interactive"] = args.non_interactive if args.output_dir is not None: kwargs["output_dir"] = args.output_dir - output_path = download(**kwargs) + output_path = download(DATA_SOURCES[uid], **kwargs) if output_path is not None and verbose: print("=" * 80) diff --git 
a/mani_skill2/utils/download_demo.py b/mani_skill2/utils/download_demo.py index 99a821c24..2ef801321 100644 --- a/mani_skill2/utils/download_demo.py +++ b/mani_skill2/utils/download_demo.py @@ -6,6 +6,8 @@ from tqdm import tqdm +from mani_skill2 import DEMO_DIR + DATASET_SOURCES = {} # Rigid body envs @@ -208,8 +210,7 @@ def parse_args(args=None): "-o", "--output_dir", type=str, - default="demos", - help="The directory to save demonstrations to. The files will then be saved to //", + help="The directory to save demonstrations to. The files will then be saved to //. By default it is saved to ~/.maniskill/demos or what MS_ASSET_DIR is set to.", ) return parser.parse_args(args) @@ -244,23 +245,23 @@ def main(args): raise ValueError( f"Version v{download_version} of demonstrations for {uid} do not exist. Latest version is v{meta['latest_version']}. If you think this is a bug please raise an issue on GitHub: https://github.com/haosulab/ManiSkill2/issues" ) + + object_paths = meta["object_paths"] + output_dir = str(DEMO_DIR) + if args.output_dir: + output_dir = args.output_dir + final_path = osp.join(output_dir, f"v{download_version}", meta["env_type"]) if verbose: print( - f"Downloading v{download_version} demonstrations: {i+1}/{len(uids)}, {uid}" + f"Downloading v{download_version} demonstrations to {final_path} - {i+1}/{len(uids)}, {uid}" ) - - object_paths = meta["object_paths"] for object_path in object_paths: local_path = download_file( - osp.join(args.output_dir), object_path, version=download_version + output_dir, object_path, version=download_version ) if osp.splitext(local_path)[1] == ".zip": with zipfile.ZipFile(local_path, "r") as zip_ref: - zip_ref.extractall( - osp.join( - args.output_dir, f"v{download_version}", meta["env_type"] - ) - ) + zip_ref.extractall(final_path) os.remove(local_path) diff --git a/mani_skill2/utils/scene_builder/ai2thor/constants.py b/mani_skill2/utils/scene_builder/ai2thor/constants.py index 89b1a632a..6e0fde2e5 100644 --- a/mani_skill2/utils/scene_builder/ai2thor/constants.py +++ b/mani_skill2/utils/scene_builder/ai2thor/constants.py @@ -11,6 +11,7 @@ class SceneConfig: config_file: str source: str + spawn_pos_file: str = None @dataclass diff --git a/mani_skill2/utils/scene_builder/ai2thor/scene_builder.py b/mani_skill2/utils/scene_builder/ai2thor/scene_builder.py index ce7989e15..5bc028203 100644 --- a/mani_skill2/utils/scene_builder/ai2thor/scene_builder.py +++ b/mani_skill2/utils/scene_builder/ai2thor/scene_builder.py @@ -7,7 +7,7 @@ import json import os.path as osp from pathlib import Path -from typing import List +from typing import Dict, List, Tuple import numpy as np import sapien @@ -17,7 +17,10 @@ from tqdm import tqdm from mani_skill2 import ASSET_DIR +from mani_skill2.agents.robots.fetch import FETCH_UNIQUE_COLLISION_BIT, Fetch +from mani_skill2.envs.scene import ManiSkillScene from mani_skill2.utils.scene_builder import SceneBuilder +from mani_skill2.utils.structs.actor import Actor from .constants import SCENE_SOURCE_TO_DATASET, SceneConfig, load_ai2thor_metadata @@ -33,6 +36,9 @@ None, ) +# TODO (arth): fix coacd so this isn't necessary +WORKING_OBJS = ["apple", "potato", "tomato"] + class AI2THORBaseSceneBuilder(SceneBuilder): """ @@ -40,8 +46,11 @@ class AI2THORBaseSceneBuilder(SceneBuilder): """ scene_dataset: str + robot_init_qpos_noise: float = 0.02 - def __init__(self): + def __init__(self, env, robot_init_qpos_noise=0.02): + self.env = env + self.robot_init_qpos_noise = robot_init_qpos_noise global OBJECT_SEMANTIC_ID_MAPPING, 
SEMANTIC_ID_OBJECT_MAPPING, MOVEABLE_OBJECT_IDS ( OBJECT_SEMANTIC_ID_MAPPING, @@ -61,7 +70,10 @@ def __init__(self): else: self._scene_configs = ALL_SCENE_CONFIGS[self.scene_dataset] - super().__init__() + self._scene_navigable_positions = [None] * len(self._scene_configs) + self.actor_default_poses: List[Tuple[Actor, sapien.Pose]] = [] + + self.scene_idx = None def _should_be_kinematic(self, template_name: str): object_config_json = ( @@ -75,14 +87,16 @@ def _should_be_kinematic(self, template_name: str): object_category = SEMANTIC_ID_OBJECT_MAPPING[semantic_id] return object_category not in MOVEABLE_OBJECT_IDS + # TODO (arth): figure out coacd building issues (currenlty fails on > 80% objs) def build( - self, scene: sapien.Scene, scene_id=0, convex_decomposition="none", **kwargs + self, scene: ManiSkillScene, scene_idx=0, convex_decomposition="none", **kwargs ): # save scene and movable objects when building scene - self._scene_objects: List[sapien.Entity] = [] - self._movable_objects: List[sapien.Entity] = [] + self._scene_objects: Dict[str, Actor] = dict() + self._movable_objects: Dict[str, Actor] = dict() + self.scene_idx = scene_idx - scene_cfg = self._scene_configs[scene_id] + scene_cfg = self._scene_configs[scene_idx] dataset = SCENE_SOURCE_TO_DATASET[scene_cfg.source] with open(osp.join(dataset.dataset_path, scene_cfg.config_file), "rb") as f: @@ -94,21 +108,22 @@ def build( / f"{scene_json['stage_instance']['template_name']}.glb" ) builder = scene.create_actor_builder() - builder.add_visual_from_file(bg_path) - builder.add_nonconvex_collision_from_file(bg_path) - bg = builder.build_kinematic(name="scene_background") - q = transforms3d.quaternions.axangle2quat( + + bg_q = transforms3d.quaternions.axangle2quat( np.array([1, 0, 0]), theta=np.deg2rad(90) ) if self.scene_dataset == "ProcTHOR": # for some reason the scene needs to rotate around y-axis by 90 degrees for ProcTHOR scenes from hssd dataset - q = transforms3d.quaternions.qmult( + bg_q = transforms3d.quaternions.qmult( q, transforms3d.quaternions.axangle2quat( np.array([0, -1, 0]), theta=np.deg2rad(90) ), ) - bg.set_pose(sapien.Pose(q=q)) + bg_pose = sapien.Pose(q=bg_q) + builder.add_visual_from_file(bg_path, pose=bg_pose) + builder.add_nonconvex_collision_from_file(bg_path, pose=bg_pose) + self.bg = builder.build_static(name="scene_background") global_id = 0 for object in tqdm(scene_json["object_instances"][:]): @@ -119,58 +134,127 @@ def build( ) actor_id = f"{object['template_name']}_{global_id}" global_id += 1 + q = transforms3d.quaternions.axangle2quat( + np.array([1, 0, 0]), theta=np.deg2rad(90) + ) + rot_q = [ + object["rotation"][0], + object["rotation"][1], + object["rotation"][2], + object["rotation"][-1], + ] + q = transforms3d.quaternions.qmult(q, rot_q) + builder = scene.create_actor_builder() - builder.add_visual_from_file(str(model_path)) - if self._should_be_kinematic(object["template_name"]): + if self._should_be_kinematic(object["template_name"]) or not np.any( + [name in actor_id.lower() for name in WORKING_OBJS] + ): + position = [ + object["translation"][0], + -object["translation"][2], + object["translation"][1] + 0, + ] position = [ object["translation"][0], -object["translation"][2], object["translation"][1] + 0, ] - builder.add_nonconvex_collision_from_file(str(model_path)) - actor = builder.build_kinematic(name=actor_id) + pose = sapien.Pose(p=position, q=q) + builder.add_visual_from_file(str(model_path), pose=pose) + builder.add_nonconvex_collision_from_file(str(model_path), pose=pose) + actor = 
builder.build_static(name=actor_id) else: position = [ object["translation"][0], -object["translation"][2], object["translation"][1] + 0.005, ] + builder.add_visual_from_file(str(model_path)) builder.add_multiple_convex_collisions_from_file( - str(model_path), decomposition=convex_decomposition + str(model_path), + decomposition=convex_decomposition, ) actor = builder.build(name=actor_id) - self._scene_objects.append(actor) + self._movable_objects[actor.name] = actor + pose = sapien.Pose(p=position, q=q) + self.actor_default_poses.append((actor, pose)) + self._scene_objects[actor_id] = actor - q = transforms3d.quaternions.axangle2quat( - np.array([1, 0, 0]), theta=np.deg2rad(90) + if self._scene_navigable_positions[scene_idx] is None: + self._scene_navigable_positions[scene_idx] = np.load( + Path(dataset.dataset_path) + / ( + Path(scene_cfg.config_file).stem.split(".")[0] + + f".{self.env.robot_uids}.navigable_positions.npy" + ) ) - rot_q = [ - object["rotation"][0], - object["rotation"][1], - object["rotation"][2], - object["rotation"][-1], - ] - q = transforms3d.quaternions.qmult(q, rot_q) - pose = sapien.Pose(p=position, q=q) + + def disable_fetch_ground_collisions(self): + # TODO (stao) (arth): is there a better way to model robots in sim. This feels very unintuitive. + for body in self.bg._bodies: + cs = body.get_collision_shapes()[0] + cg = cs.get_collision_groups() + cg[2] |= FETCH_UNIQUE_COLLISION_BIT + cs.set_collision_groups(cg) + + def set_actor_default_poses_vels(self): + for actor, pose in self.actor_default_poses: actor.set_pose(pose) + actor.set_linear_velocity([0, 0, 0]) + actor.set_angular_velocity([0, 0, 0]) + + def initialize(self, env_idx): - # get movable objects - self._movable_objects = [ - obj - for obj in self.scene_objects - if not obj.find_component_by_type( - physx.PhysxRigidDynamicComponent - ).kinematic - ] + self.set_actor_default_poses_vels() + + if self.env.robot_uids == "panda": + # fmt: off + # EE at [0.615, 0, 0.17] + qpos = np.array( + [0.0, np.pi / 8, 0, -np.pi * 5 / 8, 0, np.pi * 3 / 4, np.pi / 4, 0.04, 0.04] + ) + # fmt: on + qpos[:-2] += self.env._episode_rng.normal( + 0, self.robot_init_qpos_noise, len(qpos) - 2 + ) + self.env.agent.reset(qpos) + self.env.agent.robot.set_pose(sapien.Pose([-0.615, 0, 0])) + elif self.env.robot_uids == "xmate3_robotiq": + qpos = np.array( + [0, np.pi / 6, 0, np.pi / 3, 0, np.pi / 2, -np.pi / 2, 0, 0] + ) + qpos[:-2] += self.env._episode_rng.normal( + 0, self.robot_init_qpos_noise, len(qpos) - 2 + ) + self.env.agent.reset(qpos) + self.env.agent.robot.set_pose(sapien.Pose([-0.562, 0, 0])) + elif self.env.robot_uids == "fetch": + agent: Fetch = self.env.agent + agent.reset(agent.RESTING_QPOS) + agent.robot.set_pose(sapien.Pose([*self.navigable_positions[0], 0.001])) + self.disable_fetch_ground_collisions() + else: + raise NotImplementedError(self.env.robot_uids) @property - def scene_configs(self): + def scene_configs(self) -> List[SceneConfig]: return self._scene_configs @property - def scene_objects(self): - return self._scene_objects + def navigable_positions(self) -> np.ndarray: + assert isinstance( + self.scene_idx, int + ), "Must build scene before getting navigable positions" + return self._scene_navigable_positions[self.scene_idx] + + @property + def scene_objects(self) -> List[Actor]: + return list(self._scene_objects.values()) + + @property + def movable_objects(self) -> List[Actor]: + return list(self._movable_objects.values()) @property - def movable_objects(self): + def movable_objects_by_id(self) -> 
Dict[str, Actor]: return self._movable_objects diff --git a/mani_skill2/utils/scene_builder/replicacad/metadata/scene_configs.json b/mani_skill2/utils/scene_builder/replicacad/metadata/scene_configs.json new file mode 100644 index 000000000..173a38ebe --- /dev/null +++ b/mani_skill2/utils/scene_builder/replicacad/metadata/scene_configs.json @@ -0,0 +1,10 @@ +{ + "scenes": [ + "apt_0.scene_instance.json", + "apt_1.scene_instance.json", + "apt_2.scene_instance.json", + "apt_3.scene_instance.json", + "apt_4.scene_instance.json", + "apt_5.scene_instance.json" + ] +} diff --git a/mani_skill2/utils/scene_builder/replicacad/scene_builder.py b/mani_skill2/utils/scene_builder/replicacad/scene_builder.py new file mode 100644 index 000000000..864b10d41 --- /dev/null +++ b/mani_skill2/utils/scene_builder/replicacad/scene_builder.py @@ -0,0 +1,215 @@ +""" +Code for building scenes from the ReplicaCAD dataset https://aihabitat.org/datasets/replica_cad/ + +This code is also heavily commented to serve as a tutorial for how to build custom scenes from scratch and/or port scenes over from other datasets/simulators +""" + +import json +import os.path as osp + +import numpy as np +import sapien +import torch +import transforms3d + +from mani_skill2 import ASSET_DIR +from mani_skill2.agents.robots.fetch.fetch import FETCH_UNIQUE_COLLISION_BIT, Fetch +from mani_skill2.envs.scene import ManiSkillScene +from mani_skill2.utils.scene_builder import SceneBuilder +from mani_skill2.utils.structs.articulation import Articulation + +DATASET_CONFIG_DIR = osp.join(osp.dirname(__file__), "metadata") + + +class ReplicaCADSceneBuilder(SceneBuilder): + builds_lighting = True + + def __init__(self, env, robot_init_qpos_noise=0.02): + super().__init__(env, robot_init_qpos_noise) + # Scene datasets from any source generally have several configurations, each of which may involve changing object geometries, poses etc. + # You should store this configuration information in the self._scene_configs list, which permits the code to sample from when + # simulating more than one scene or performing reconfiguration + + # for ReplicaCAD we have saved the list of all scene configuration files from the dataset to a local json file and create SceneConfig objects out of it + with open(osp.join(DATASET_CONFIG_DIR, "scene_configs.json")) as f: + self.scene_configs = json.load(f)["scenes"] + + def build( + self, scene: ManiSkillScene, scene_idx=0, convex_decomposition="none", **kwargs + ): + """ + Given a ManiSkillScene, a sampled scene_idx, build/load the scene objects + + scene_idx is an index corresponding to a sampled scene config in self._scene_configs. The code should... + TODO (stao): scene_idx should probably be replaced with scene config? + + TODO (stao): provide a simple way in maybe SceneBuilder to override how to decide if an object should be dynamic or not? + """ + scene_cfg_path = self.scene_configs[scene_idx] + + # We read the json config file describing the scene setup for the selected ReplicaCAD scene + with open( + osp.join( + ASSET_DIR, + "scene_datasets/replica_cad_dataset/configs/scenes", + scene_cfg_path, + ), + "rb", + ) as f: + scene_json = json.load(f) + + # The complex part of porting over scene datasets is that each scene dataset often has it's own format and there is no + # one size fits all solution to read that format and use it. 
The best way to port a scene dataset over is to look + # at the configuration files, get a sense of the pattern and find how they reference .glb model files and potentially + # decomposed convex meshes for physical simulation + + # ReplicaCAD stores the background model here + background_template_name = osp.basename( + scene_json["stage_instance"]["template_name"] + ) + bg_path = str( + ASSET_DIR + / f"scene_datasets/replica_cad_dataset/stages/{background_template_name}.glb" + ) + builder = scene.create_actor_builder() + # Note all ReplicaCAD assets are rotated by 90 degrees as they use a different xyz convention to SAPIEN/ManiSkill. + q = transforms3d.quaternions.axangle2quat( + np.array([1, 0, 0]), theta=np.deg2rad(90) + ) + bg_pose = sapien.Pose(q=q) + + # When creating objects that do not need to be moved ever, you must provide the pose of the object ahead of time + # and use builder.build_static. Objects such as the scene background (also called a stage) fits in this category + builder.add_visual_from_file(bg_path, pose=bg_pose) + builder.add_nonconvex_collision_from_file(bg_path, pose=bg_pose) + self.bg = builder.build_static(name="scene_background") + + # For the purposes of physical simulation, we disable collisions between the Fetch robot and the scene background + self.disable_fetch_ground_collisions() + + # In scenes, there will always be dynamic objects, kinematic objects, and static objects. + # In the case of ReplicaCAD there are only dynamic and static objects. Since dynamic objects can be moved during simulation + # we need to keep track of the initial poses of each dynamic actor we create. + self.default_object_poses = [] + for obj_meta in scene_json["object_instances"]: + + # Again, for any dataset you will have to figure out how they reference object files + # Note that ASSET_DIR will always refer to the ~/.ms_data folder or whatever MS_ASSET_DIR is set to + obj_cfg_path = osp.join( + ASSET_DIR, + "scene_datasets/replica_cad_dataset/configs/objects", + f"{osp.basename(obj_meta['template_name'])}.object_config.json", + ) + with open(obj_cfg_path) as f: + obj_cfg = json.load(f) + visual_file = osp.join(osp.dirname(obj_cfg_path), obj_cfg["render_asset"]) + if "collision_asset" in obj_cfg: + collision_file = osp.join( + osp.dirname(obj_cfg_path), obj_cfg["collision_asset"] + ) + builder = scene.create_actor_builder() + pos = obj_meta["translation"] + rot = obj_meta["rotation"] + # left multiplying by the offset quaternion we used for the stage/scene background as all assets in ReplicaCAD are rotated by 90 degrees + pose = sapien.Pose(q=q) * sapien.Pose(pos, rot) + + # Neatly for simulation, ReplicaCAD specifies if an object is meant to be simulated as dynamic (can be moved like pots) or static (must stay still, like kitchen counters) + if obj_meta["motion_type"] == "DYNAMIC": + builder.add_visual_from_file(visual_file) + if ( + "use_bounding_box_for_collision" in obj_cfg + and obj_cfg["use_bounding_box_for_collision"] + ): + # some dynamic objects do not have decomposed convex meshes and instead should use a simple bounding box for collision detection + # in this case we use the add_convex_collision_from_file function of SAPIEN which just creates a convex collision based on the visual mesh + builder.add_convex_collision_from_file(visual_file) + else: + builder.add_multiple_convex_collisions_from_file(collision_file) + actor = builder.build(name=obj_meta["template_name"]) + self.default_object_poses.append( + (actor, pose * sapien.Pose(p=[0, 0, 0.0])) + ) + elif 
obj_meta["motion_type"] == "STATIC": + builder.add_visual_from_file(visual_file, pose=pose) + # for static (and dynamic) objects you don't need to use pre convex decomposed meshes and instead can directly + # add the non convex collision mesh based on the visual mesh + builder.add_nonconvex_collision_from_file(visual_file, pose=pose) + actor = builder.build_static(name=obj_meta["template_name"]) + + # ReplicaCAD also provides articulated objects + for articulated_meta in scene_json["articulated_object_instances"]: + + template_name = articulated_meta["template_name"] + pos = articulated_meta["translation"] + rot = articulated_meta["rotation"] + # articulated_cfg_path = osp.join( + # ASSET_DIR, + # "scene_datasets/replica_cad_dataset/urdf/objects", + # f"{osp.basename(obj_meta['template_name'])}.object_config.json", + # ) + urdf_path = osp.join( + ASSET_DIR, + f"scene_datasets/replica_cad_dataset/urdf/{template_name}/{template_name}.urdf", + ) + urdf_loader = scene.create_urdf_loader() + urdf_loader.name = template_name + urdf_loader.fix_root_link = articulated_meta["fixed_base"] + urdf_loader.disable_self_collisions = True + if "uniform_scale" in articulated_meta: + urdf_loader.scale = articulated_meta["uniform_scale"] + articulation = urdf_loader.load(urdf_path) + pose = sapien.Pose(q=q) * sapien.Pose(pos, rot) + self.default_object_poses.append((articulation, pose)) + + # ReplicaCAD also specifies where to put lighting + with open( + osp.join( + ASSET_DIR, + "scene_datasets/replica_cad_dataset/configs/lighting", + f"{osp.basename(scene_json['default_lighting'])}.lighting_config.json", + ) + ) as f: + lighting_cfg = json.load(f) + for light_cfg in lighting_cfg["lights"].values(): + # It appears ReplicaCAD only specifies point light sources so we only use those here + if light_cfg["type"] == "point": + light_pos_fixed = ( + sapien.Pose(q=q) * sapien.Pose(p=light_cfg["position"]) + ).p + # In SAPIEN, one can set color to unbounded values, higher just means more intense. ReplicaCAD provides color and intensity separately so + # we multiply it together here. We also take absolute value of intensity since some scene configs write negative intensities (which result in black holes) + scene.add_point_light( + light_pos_fixed, + color=np.array(light_cfg["color"]) * np.abs(light_cfg["intensity"]), + ) + scene.set_ambient_light([0.3, 0.3, 0.3]) + + def initialize(self, env_idx): + if self.env.robot_uids == "fetch": + agent: Fetch = self.env.agent + agent.reset(agent.RESTING_QPOS) + + # set robot to be inside the small room in the middle + # agent.robot.set_pose(sapien.Pose([0.8, 1.9, 0.001])) + # qpos = agent.robot.qpos + # qpos[:, 2] = 2.9 + # agent.robot.set_qpos(qpos) + + agent.robot.set_pose(sapien.Pose([-0.8, -1, 0.001])) + + else: + raise NotImplementedError(self.env.robot_uids) + for obj, pose in self.default_object_poses: + # TODO (stao): It's not super clear if sleeping objects works on GPU but appears to improve FPS a little. + obj.set_pose(pose) + if isinstance(obj, Articulation): + obj.set_qpos(obj.qpos * 0) + # TODO (stao): settle objects for a few steps then save poses again on first run? + + def disable_fetch_ground_collisions(self): + # TODO (stao) (arth): is there a better way to model robots in sim. This feels very unintuitive. 
+ for body in self.bg._bodies: + cs = body.get_collision_shapes()[0] + cg = cs.get_collision_groups() + cg[2] |= FETCH_UNIQUE_COLLISION_BIT + cs.set_collision_groups(cg) diff --git a/mani_skill2/utils/scene_builder/scene_builder.py b/mani_skill2/utils/scene_builder/scene_builder.py index 14578116b..d952460d2 100644 --- a/mani_skill2/utils/scene_builder/scene_builder.py +++ b/mani_skill2/utils/scene_builder/scene_builder.py @@ -1,18 +1,23 @@ -from dataclasses import dataclass, field -from typing import List +from typing import Any, Dict, List import sapien from mani_skill2.envs.sapien_env import BaseEnv -from mani_skill2.envs.scene import ManiSkillScene from mani_skill2.utils.structs.actor import Actor -@dataclass class SceneBuilder: env: BaseEnv - _scene_objects: List[Actor] = field(default_factory=list) - _movable_objects: List[Actor] = field(default_factory=list) + _scene_objects: List[Actor] = [] + _movable_objects: List[Actor] = [] + builds_lighting: bool = False + """Whether this scene builder will add it's own lighting when build is called. If False, ManiSkill will add some default lighting""" + scene_configs: List[Any] = None + """List of scene configuration information that can be used to construct scenes. Can be simply a path to a json file or a dictionary""" + + def __init__(self, env, robot_init_qpos_noise=0.02): + self.env = env + self.robot_init_qpos_noise = robot_init_qpos_noise def build(self, **kwargs): """ @@ -31,9 +36,17 @@ def scene(self): return self.env._scene @property - def scene_objects(self): + def scene_objects(self) -> List[Actor]: + raise NotImplementedError() + + @property + def movable_objects(self) -> List[Actor]: + raise NotImplementedError() + + @property + def scene_objects_by_id(self) -> Dict[str, Actor]: raise NotImplementedError() @property - def movable_objects(self): + def movable_objects_by_id(self) -> Dict[str, Actor]: raise NotImplementedError() diff --git a/mani_skill2/utils/scene_builder/table/table_scene_builder.py b/mani_skill2/utils/scene_builder/table/table_scene_builder.py index 000ed5d2d..45bcd0662 100644 --- a/mani_skill2/utils/scene_builder/table/table_scene_builder.py +++ b/mani_skill2/utils/scene_builder/table/table_scene_builder.py @@ -1,5 +1,4 @@ import os.path as osp -from dataclasses import dataclass from pathlib import Path from typing import List @@ -9,11 +8,11 @@ from transforms3d.euler import euler2quat from mani_skill2.agents.multi_agent import MultiAgent +from mani_skill2.agents.robots.fetch import FETCH_UNIQUE_COLLISION_BIT from mani_skill2.utils.building.ground import build_ground from mani_skill2.utils.scene_builder import SceneBuilder -@dataclass class TableSceneBuilder(SceneBuilder): robot_init_qpos_noise: float = 0.02 @@ -113,16 +112,14 @@ def initialize(self): self.env.agent.reset(qpos) self.env.agent.robot.set_pose(sapien.Pose([-1.05, 0, -self.table_height])) - from mani_skill2.agents.robots.fetch import FETCH_UNIQUE_COLLISION_BIT - - cs = ( - self.ground._objs[0] - .find_component_by_type(sapien.physx.PhysxRigidStaticComponent) - .get_collision_shapes()[0] - ) - cg = cs.get_collision_groups() - cg[2] |= FETCH_UNIQUE_COLLISION_BIT - cs.set_collision_groups(cg) + # TODO (stao) (arth): is there a better way to model robots in sim. This feels very unintuitive. 
diff --git a/mani_skill2/utils/scene_builder/table/table_scene_builder.py b/mani_skill2/utils/scene_builder/table/table_scene_builder.py
index 000ed5d2d..45bcd0662 100644
--- a/mani_skill2/utils/scene_builder/table/table_scene_builder.py
+++ b/mani_skill2/utils/scene_builder/table/table_scene_builder.py
@@ -1,5 +1,4 @@
 import os.path as osp
-from dataclasses import dataclass
 from pathlib import Path
 from typing import List
 
@@ -9,11 +8,11 @@
 from transforms3d.euler import euler2quat
 
 from mani_skill2.agents.multi_agent import MultiAgent
+from mani_skill2.agents.robots.fetch import FETCH_UNIQUE_COLLISION_BIT
 from mani_skill2.utils.building.ground import build_ground
 from mani_skill2.utils.scene_builder import SceneBuilder
 
 
-@dataclass
 class TableSceneBuilder(SceneBuilder):
     robot_init_qpos_noise: float = 0.02
 
@@ -113,16 +112,14 @@ def initialize(self):
             self.env.agent.reset(qpos)
             self.env.agent.robot.set_pose(sapien.Pose([-1.05, 0, -self.table_height]))
 
-            from mani_skill2.agents.robots.fetch import FETCH_UNIQUE_COLLISION_BIT
-
-            cs = (
-                self.ground._objs[0]
-                .find_component_by_type(sapien.physx.PhysxRigidStaticComponent)
-                .get_collision_shapes()[0]
-            )
-            cg = cs.get_collision_groups()
-            cg[2] |= FETCH_UNIQUE_COLLISION_BIT
-            cs.set_collision_groups(cg)
+            # TODO (stao) (arth): is there a better way to model robots in sim? This feels very unintuitive.
+            for obj in self.ground._objs:
+                cs = obj.find_component_by_type(
+                    sapien.physx.PhysxRigidStaticComponent
+                ).get_collision_shapes()[0]
+                cg = cs.get_collision_groups()
+                cg[2] |= FETCH_UNIQUE_COLLISION_BIT
+                cs.set_collision_groups(cg)
         elif self.env.robot_uids == ("panda", "panda"):
             agent: MultiAgent = self.env.agent
             qpos = np.array(
diff --git a/mani_skill2/utils/structs/actor.py b/mani_skill2/utils/structs/actor.py
index c4754df80..bc52ba650 100644
--- a/mani_skill2/utils/structs/actor.py
+++ b/mani_skill2/utils/structs/actor.py
@@ -36,6 +36,9 @@ class Actor(PhysxRigidDynamicComponentStruct, BaseStruct[sapien.Entity]):
     _builder_initial_pose: sapien.Pose = None
     name: str = None
 
+    def __hash__(self):
+        return self._objs[0].__hash__()
+
     @classmethod
     def _create_from_entities(
         cls,
@@ -57,6 +60,11 @@ def _create_from_entities(
                 px_body_type = "kinematic"
             else:
                 px_body_type = "dynamic"
+        else:
+            bodies = [
+                ent.find_component_by_type(physx.PhysxRigidStaticComponent)
+                for ent in entities
+            ]
         return cls(
             _objs=entities,
             _scene=scene,
diff --git a/mani_skill2/utils/structs/articulation.py b/mani_skill2/utils/structs/articulation.py
index ee27a7f4c..8418b88c5 100644
--- a/mani_skill2/utils/structs/articulation.py
+++ b/mani_skill2/utils/structs/articulation.py
@@ -46,6 +46,8 @@ class Articulation(BaseStruct[physx.PhysxArticulation]):
     """Maps joint name to the Joint object"""
     active_joints: List[Joint]
     """List of active Joint objects, referencing elements in self.joints"""
+    active_joint_map: OrderedDict[str, Joint]
+    """Maps active joint name to the Joint object, referencing elements in self.joints"""
     name: str = None
     """Name of this articulation"""
 
@@ -93,6 +95,7 @@ def _create_from_physx_articulations(
             joints=[],
             joint_map=OrderedDict(),
             active_joints=[],
+            active_joint_map=OrderedDict(),
             name=shared_name,
             _merged=_merged,
         )
@@ -141,13 +144,19 @@ def _create_from_physx_articulations(
         joint_map = OrderedDict()
         wrapped_joints: List[Joint] = []
         for joint_index, joints in enumerate(all_joint_objs):
-            wrapped_joint = Joint.create(joints, self, joint_index)
+            try:
+                active_joint_index = all_active_joint_names.index(
+                    all_joint_names[joint_index]
+                )
+            except:
+                active_joint_index = None
+            wrapped_joint = Joint.create(joints, self, joint_index, active_joint_index)
             joint_map[wrapped_joint.name] = wrapped_joint
             wrapped_joints.append(wrapped_joint)
         self.joints = wrapped_joints
         self.joint_map = joint_map
         self.active_joints = [wrapped_joints[i] for i in active_joint_indices]
-
+        self.active_joint_map = {joint.name: joint for joint in self.active_joints}
         return self
 
     @classmethod
diff --git a/mani_skill2/utils/structs/joint.py b/mani_skill2/utils/structs/joint.py
index 04b53c491..b05d5b955 100644
--- a/mani_skill2/utils/structs/joint.py
+++ b/mani_skill2/utils/structs/joint.py
@@ -28,6 +28,9 @@ class Joint(BaseStruct[physx.PhysxArticulationJoint]):
     articulation: Articulation
 
     index: int
+    """index of this joint among all joints"""
+    active_index: int
+    """index of this joint among the active joints"""
     child_link: Link = None
     parent_link: Link = None
 
@@ -42,6 +45,7 @@ def create(
        physx_joints: List[physx.PhysxArticulationJoint],
        articulation: Articulation,
        joint_index: int,
+       active_joint_index: int,
    ):
        # naming convention of the original physx joints is "scene--_"
        shared_name = "_".join(
@@ -69,6 +73,7 @@
        return cls(
            articulation=articulation,
            index=joint_index,
+           active_index=active_joint_index,
            _objs=physx_joints,
            _scene=articulation._scene,
            _scene_mask=articulation._scene_mask,
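Together, `Articulation.active_joint_map` and `Joint.active_index` give a direct path from a joint name to its column in the batched `qpos`; the commented-out prints in `manualtest/scene_gpu.py` below use exactly this pattern. A hedged usage sketch (the joint name is Fetch-specific and taken from that test file):

    # assumes `robot` is a wrapped Articulation, e.g. env.agent.robot
    joint = robot.active_joint_map["r_gripper_finger_joint"]
    finger_qpos = robot.qpos[:, joint.active_index]  # one column per parallel env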
diff --git a/mani_skill2/utils/structs/types.py b/mani_skill2/utils/structs/types.py
index 788c1b1f7..fd3bb083c 100644
--- a/mani_skill2/utils/structs/types.py
+++ b/mani_skill2/utils/structs/types.py
@@ -23,7 +23,7 @@ class GPUMemoryConfig:
     temp_buffer_capacity: int = 2**24
     """Increase this if you get 'PxgPinnedHostLinearMemoryAllocator: overflowing initial allocation size, increase capacity to at least %.' """
-    max_rigid_contact_count: int = 524288
+    max_rigid_contact_count: int = 2**19
     max_rigid_patch_count: int = (
         2**18
     )  # 81920 is SAPIEN default but most tasks work with 2**18
diff --git a/mani_skill2/utils/wrappers/record.py b/mani_skill2/utils/wrappers/record.py
index 3a87ec5e4..afa6a4b73 100644
--- a/mani_skill2/utils/wrappers/record.py
+++ b/mani_skill2/utils/wrappers/record.py
@@ -214,13 +214,14 @@ def capture_image(self):
 
     def reset(
         self,
-        *,
+        *args,
         seed: Optional[Union[int, List[int]]] = None,
-        options: Optional[dict] = dict(),
+        options: Optional[dict] = None,
         **kwargs,
     ):
         skip_trajectory = False
-        options.pop("save_trajectory", False)
+        if options is not None:
+            options.pop("save_trajectory", False)
 
         if self.save_on_reset and self._episode_id >= 0 and not skip_trajectory:
             self.flush_trajectory(ignore_empty_transition=True)
@@ -235,7 +236,7 @@
         self._episode_id += 1
         reset_kwargs = copy.deepcopy(dict(seed=seed, options=options, **kwargs))
-        obs, info = super().reset(seed=seed, options=options, **kwargs)
+        obs, info = super().reset(*args, seed=seed, options=options, **kwargs)
 
         if self.save_trajectory:
             state = self._base_env.get_state()
diff --git a/manualtest/scene_gpu.py b/manualtest/scene_gpu.py
new file mode 100644
index 000000000..20e3d1e69
--- /dev/null
+++ b/manualtest/scene_gpu.py
@@ -0,0 +1,97 @@
+import gymnasium as gym
+import numpy as np
+import torch
+
+from mani_skill2.envs.scenes.base_env import SceneManipulationEnv
+from mani_skill2.utils.scene_builder.ai2thor import (
+    ArchitecTHORSceneBuilder,
+    ProcTHORSceneBuilder,
+    RoboTHORSceneBuilder,
+    iTHORSceneBuilder,
+)
+from mani_skill2.utils.scene_builder.replicacad.scene_builder import (
+    ReplicaCADSceneBuilder,
+)
+from mani_skill2.utils.wrappers import RecordEpisode
+
+if __name__ == "__main__":
+
+    render_mode = "human"
+    print("RENDER_MODE", render_mode)
+
+    env = gym.make(
+        "SceneManipulation-v1",
+        render_mode=render_mode,
+        robot_uids="fetch",
+        scene_builder_cls=ReplicaCADSceneBuilder,
+        num_envs=1,
+        # force_use_gpu_sim=True,
+        scene_idxs=3,
+        # obs_mode="rgbd"
+    )
+    obs, info = env.reset(seed=0)
+    # import matplotlib.pyplot as plt
+    # import ipdb;ipdb.set_trace()
+    viewer = env.render()
+    # base_env: SequentialTaskEnv = env.unwrapped
+    viewer.paused = True
+    viewer = env.render()
+    for i in range(10000):
+        action = np.zeros(env.action_space.shape)
+        # action[..., -7] = -1
+        if i < 60:
+            action[..., -3] = 1
+        elif i < 100:
+            action[..., -3] = 1
+            action[..., -1] = 0.3
+        elif i < 200:
+            action[..., -3] = 1
+            # action[..., -1] = .22
+        else:
+            action = env.action_space.sample()
+        if viewer.window.key_press("r"):
+            obs, info = env.reset()
+        #
+        env.step(action)
+        # print(
+        #     robot.qpos[:, robot.active_joint_map["r_gripper_finger_joint"].active_index]
+        # )
+        # print(
+        #     robot.qpos[:, robot.active_joint_map["l_gripper_finger_joint"].active_index]
+        # )
+
+        env.render()
+    # agent_obs = obs["agent"]
+    # extra_obs = obs["extra"]
+    # fetch_head_depth = obs["sensor_data"]["fetch_head"]["depth"]
+    # fetch_hand_depth = obs["sensor_data"]["fetch_hand"]["depth"]
+
+    # # fetch_head_depth = torch.Tensor(fetch_head_depth.astype(np.int16))
+    # # fetch_hand_depth = torch.Tensor(fetch_hand_depth.astype(np.int16))
+
+    # import matplotlib.pyplot as plt
+    # plt.imsave('out.png', np.concatenate([fetch_head_depth, fetch_hand_depth], axis=-2)[..., 0])
+
+    # env = RecordEpisode(env, '.', save_trajectory=False)
+    # env.reset(seed=0)
+
+    # # print force on robot
+    # robot_link_names = [link.name for link in env.agent.robot.links]
+    # print(robot_link_names)
+    # print(torch.norm(env.agent.robot.get_net_contact_forces(robot_link_names), dim=-1))
+
+    # for step_num in range(30):
+    #     action = np.zeros(env.action_space.shape)
+
+    #     # torso up
+    #     action[..., -4] = 1
+    #     # head still
+    #     action[..., -5] = 0
+    #     # gripper open
+    #     action[..., -7] = 1
+
+    #     # move forward and left
+    #     action[..., -3:-1] = 1
+
+    #     obs, rew, term, trunc, info = env.step(action=action)
+    # env.close()
diff --git a/manualtest/scene_manip_test.py b/manualtest/scene_manip_test.py
new file mode 100644
index 000000000..f6b56f95d
--- /dev/null
+++ b/manualtest/scene_manip_test.py
@@ -0,0 +1,96 @@
+from mani_skill2.envs.scenes.tasks import SequentialTaskEnv, PickSequentialTaskEnv
+from mani_skill2.envs.scenes.tasks.planner import PickSubtask, plan_data_from_file
+from mani_skill2.utils.scene_builder.ai2thor import (
+    ProcTHORSceneBuilder, ArchitecTHORSceneBuilder,
+    iTHORSceneBuilder, RoboTHORSceneBuilder,
+)
+from mani_skill2.envs.scenes.base_env import SceneManipulationEnv
+from mani_skill2.utils.wrappers import RecordEpisode
+
+
+import gymnasium as gym
+import numpy as np
+import torch
+
+import os
+
+render_mode = (
+    "rgb_array" if (
+        "SAPIEN_NO_DISPLAY" in os.environ
+        and int(os.environ["SAPIEN_NO_DISPLAY"]) == 1
+    ) else "human"
+)
+render_mode = "rgb_array"
+print("RENDER_MODE", render_mode)
+
+SCENE_IDX_TO_APPLE_PLAN = {
+    0: [PickSubtask(obj_id="objects/Apple_5_111")],
+    1: [PickSubtask(obj_id="objects/Apple_16_40")],
+    2: [PickSubtask(obj_id="objects/Apple_12_64")],
+    3: [PickSubtask(obj_id="objects/Apple_29_113")],
+    4: [PickSubtask(obj_id="objects/Apple_28_35")],
+    5: [PickSubtask(obj_id="objects/Apple_17_88")],
+    6: [PickSubtask(obj_id="objects/Apple_1_35")],
+    7: [PickSubtask(obj_id="objects/Apple_25_48")],
+    8: [PickSubtask(obj_id="objects/Apple_9_46")],
+    9: [PickSubtask(obj_id="objects/Apple_13_72")],
+}
+
+SCENE_IDX = 6
+env: SequentialTaskEnv = gym.make(
+    'SequentialTask-v0',
+    obs_mode='rgbd',
+    render_mode=render_mode,
+    control_mode='pd_joint_delta_pos',
+    reward_mode='dense',
+    robot_uids='fetch',
+    scene_builder_cls=ArchitecTHORSceneBuilder,
+    task_plans=[SCENE_IDX_TO_APPLE_PLAN[SCENE_IDX]],
+    scene_idxs=SCENE_IDX,
+    # num_envs=2,
+)
+
+print(env.unwrapped._init_raw_obs)
+
+# print(env.observation_space.keys())
+
+obs, info = env.reset(seed=0)
+
+# agent_obs = obs["agent"]
+# extra_obs = obs["extra"]
+# fetch_head_depth = obs["sensor_data"]["fetch_head"]["depth"]
+# fetch_hand_depth = obs["sensor_data"]["fetch_hand"]["depth"]
+
+# # fetch_head_depth = torch.Tensor(fetch_head_depth.astype(np.int16))
+# # fetch_hand_depth = torch.Tensor(fetch_hand_depth.astype(np.int16))
+
+# import matplotlib.pyplot as plt
+# plt.imsave('out.png', np.concatenate([fetch_head_depth, fetch_hand_depth], axis=-2)[..., 0])
+
+
+
+
+
+# env = RecordEpisode(env, '.', save_trajectory=False)
+# env.reset(seed=0)
+
+# # print force on robot
+# robot_link_names = [link.name for link in env.agent.robot.links]
+# print(robot_link_names)
+# print(torch.norm(env.agent.robot.get_net_contact_forces(robot_link_names), dim=-1))
+# for step_num in range(30):
+#     action = np.zeros(env.action_space.shape)
+
+#     # torso up
+#     action[..., -4] = 1
+#     # head still
+#     action[..., -5] = 0
+#     # gripper open
+#     action[..., -7] = 1
+
+#     # move forward and left
+#     action[..., -3:-1] = 1
+
+#     obs, rew, term, trunc, info = env.step(action=action)
+# env.close()
diff --git a/setup.py b/setup.py
index 65b682e14..4a50dd4ea 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,7 @@
         "imageio",
         "imageio[ffmpeg]",
         "pytorch_kinematics",
+        "huggingface_hub",  # we use HF to version control some assets/datasets more easily
     ],
     # Glob patterns do not automatically match dotfiles
     package_data={