From a8e37cb9218cdd3c20eedd91e4e46021f1f3a81c Mon Sep 17 00:00:00 2001 From: DelinQu Date: Thu, 4 Jul 2024 21:18:28 +0800 Subject: [PATCH 1/9] openvla intergration --- scripts/openvla_bridge.sh | 50 +++++ scripts/openvla_drawer_variant_agg.sh | 82 +++++++ scripts/openvla_drawer_visual_matching.sh | 132 ++++++++++++ scripts/openvla_move_near_variant_agg.sh | 138 ++++++++++++ scripts/openvla_move_near_visual_matching.sh | 33 +++ scripts/openvla_pick_coke_can_variant_agg.sh | 182 ++++++++++++++++ .../openvla_pick_coke_can_visual_matching.sh | 40 ++++ scripts/openvla_put_in_drawer_variant_agg.sh | 78 +++++++ .../openvla_put_in_drawer_visual_matching.sh | 65 ++++++ simpler_env/main_inference.py | 10 +- simpler_env/policies/openvla/openvla_model.py | 203 ++++++++++++++++++ 11 files changed, 1012 insertions(+), 1 deletion(-) create mode 100644 scripts/openvla_bridge.sh create mode 100644 scripts/openvla_drawer_variant_agg.sh create mode 100644 scripts/openvla_drawer_visual_matching.sh create mode 100644 scripts/openvla_move_near_variant_agg.sh create mode 100644 scripts/openvla_move_near_visual_matching.sh create mode 100644 scripts/openvla_pick_coke_can_variant_agg.sh create mode 100644 scripts/openvla_pick_coke_can_visual_matching.sh create mode 100644 scripts/openvla_put_in_drawer_variant_agg.sh create mode 100644 scripts/openvla_put_in_drawer_visual_matching.sh create mode 100644 simpler_env/policies/openvla/openvla_model.py diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh new file mode 100644 index 00000000..e8c8289e --- /dev/null +++ b/scripts/openvla_bridge.sh @@ -0,0 +1,50 @@ +gpu_id=0 +policy_model=openvla +ckpt_path="openvla/openvla-7b" + +scene_name=bridge_table_1_v1 +robot=widowx +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png +robot_init_x=0.147 +robot_init_y=0.028 +export DISPLAY=:1.0 +# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; +# +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; +# +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode 
--obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + + +scene_name=bridge_table_1_v2 +robot=widowx_sink_camera_setup +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png +robot_init_x=0.127 +robot_init_y=0.06 + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh new file mode 100644 index 00000000..07f64600 --- /dev/null +++ b/scripts/openvla_drawer_variant_agg.sh @@ -0,0 +1,82 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +declare -a ckpt_paths=( +"openvla/openvla-7b" +) + +declare -a env_names=( +OpenTopDrawerCustomInScene-v0 +OpenMiddleDrawerCustomInScene-v0 +OpenBottomDrawerCustomInScene-v0 +CloseTopDrawerCustomInScene-v0 +CloseMiddleDrawerCustomInScene-v0 +CloseBottomDrawerCustomInScene-v0 +) + +EXTRA_ARGS="--enable-raytracing" + + +# base setup +scene_name=frl_apartment_stage_simple + +EvalSim() { + echo ${ckpt_path} ${env_name} + + python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalSim + done +done + + +# backgrounds + +declare -a scene_names=( +"modern_bedroom_no_roof" +"modern_office_no_roof" +) + +for scene_name in "${scene_names[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt" + EvalSim + done + done +done + + +# lightings +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker" + EvalSim + done +done + + +# new cabinets +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3" + EvalSim + done +done diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh new file mode 100644 index 00000000..a021660e --- /dev/null +++ b/scripts/openvla_drawer_visual_matching.sh @@ -0,0 +1,132 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as 
policies often rely on shadows to infer depth +declare -a ckpt_paths=( +"openvla/openvla-7b" +) + +declare -a env_names=( +OpenTopDrawerCustomInScene-v0 +OpenMiddleDrawerCustomInScene-v0 +OpenBottomDrawerCustomInScene-v0 +CloseTopDrawerCustomInScene-v0 +CloseMiddleDrawerCustomInScene-v0 +CloseBottomDrawerCustomInScene-v0 +) + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" + +EvalOverlay() { +# A0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + +# A1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ + ${EXTRA_ARGS} + +# A2 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ + ${EXTRA_ARGS} + +# B0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + +# B1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ + ${EXTRA_ARGS} + +# B2 +python simpler_env/main_inference.py --policy-model openvla 
--ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ + ${EXTRA_ARGS} + +# C0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} + +# C1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ + ${EXTRA_ARGS} + +# C2 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done +done + + + +done diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh new file mode 100644 index 00000000..e5ad488e --- /dev/null +++ b/scripts/openvla_move_near_variant_agg.sh @@ -0,0 +1,138 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +# base setup + +env_name=MoveNearGoogleInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + + + +# distractor + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 
--robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs no_distractor=True; + +done + + +# backgrounds + +env_name=MoveNearGoogleInScene-v0 +declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ + "google_pick_coke_can_1_v4_alt_background_2") + +for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done + + + + + +# lighting + +env_name=MoveNearGoogleInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs slightly_darker_lighting=True; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs slightly_brighter_lighting=True; + +done + + + + + +# table textures + +env_name=MoveNearGoogleInScene-v0 +declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ + "Baked_sc1_staging_objaverse_cabinet2_h870") + +for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done + + + + +# camera orientations + +declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \ + "MoveNearAltGoogleCamera2InScene-v0") +scene_name=google_pick_coke_can_1_v4 + +for env_name in "${env_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 
60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh new file mode 100644 index 00000000..7e334d8c --- /dev/null +++ b/scripts/openvla_move_near_visual_matching.sh @@ -0,0 +1,33 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +env_name=MoveNearGoogleBakedTexInScene-v0 +# env_name=MoveNearGoogleBakedTexInScene-v1 +scene_name=google_pick_coke_can_1_v4 +rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png + +# URDF variations +declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +for urdf_version in "${urdf_version_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs urdf_version=${urdf_version} \ + --additional-env-save-tags baked_except_bpb_orange; + +done + +done diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh new file mode 100644 index 00000000..72e91605 --- /dev/null +++ b/scripts/openvla_pick_coke_can_variant_agg.sh @@ -0,0 +1,182 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically +declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +# base setup + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + + + +# table textures + +env_name=GraspSingleOpenedCokeCanInScene-v0 + +declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ + "Baked_sc1_staging_objaverse_cabinet2_h870") + + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 
-0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done + + + + +# distractors + +env_name=GraspSingleOpenedCokeCanDistractorInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} distractor_config=more; + +done + +done + + + + +# backgrounds + +env_name=GraspSingleOpenedCokeCanInScene-v0 +declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ + "google_pick_coke_can_1_v4_alt_background_2") + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done + + + +# lightings + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + 
--additional-env-build-kwargs ${coke_can_option} slightly_brighter_lighting=True; + +done + +done + + + + +# camera orientations + +declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \ + "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0") +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for env_name in "${env_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done diff --git a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh new file mode 100644 index 00000000..083a71ee --- /dev/null +++ b/scripts/openvla_pick_coke_can_visual_matching.sh @@ -0,0 +1,40 @@ + + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically +declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") + +# URDF variations +declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 +rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + + +for urdf_version in "${urdf_version_arr[@]}"; + +do for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; + +done + +done + +done diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh new file mode 100644 index 00000000..a3c63a12 --- /dev/null +++ b/scripts/openvla_put_in_drawer_variant_agg.sh @@ -0,0 +1,78 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth + + + +declare -a arr=("openvla/openvla-7b") + +declare -a env_names=( +PlaceIntoClosedTopDrawerCustomInScene-v0 +) + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs model_ids=apple" + + +# base setup +scene_name=frl_apartment_stage_simple + +EvalSim() { + echo ${ckpt_path} ${env_name} + + python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + 
--env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.65 0.65 1 --robot-init-y -0.2 0.2 3 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalSim + done +done + + +# backgrounds + +declare -a scene_names=( +"modern_bedroom_no_roof" +"modern_office_no_roof" +) + +for scene_name in "${scene_names[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt model_ids=apple" + EvalSim + done + done +done + + +# lightings +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter model_ids=apple" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker model_ids=apple" + EvalSim + done +done + + +# new cabinets +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2 model_ids=apple" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3 model_ids=apple" + EvalSim + done +done diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh new file mode 100644 index 00000000..98539bf3 --- /dev/null +++ b/scripts/openvla_put_in_drawer_visual_matching.sh @@ -0,0 +1,65 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth + +declare -a arr=("openvla/openvla-7b") + + +declare -a env_names=( +PlaceIntoClosedTopDrawerCustomInScene-v0 +# PlaceIntoClosedMiddleDrawerCustomInScene-v0 +# PlaceIntoClosedBottomDrawerCustomInScene-v0 +) + + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version} model_ids=baked_apple_v2" + + +EvalOverlay() { +# A0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + +# B0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + 
--rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + +# C0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done +done + + + +done diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py index 320f2685..7ebe20c2 100644 --- a/simpler_env/main_inference.py +++ b/simpler_env/main_inference.py @@ -7,6 +7,7 @@ from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator from simpler_env.policies.octo.octo_server_model import OctoServerInference from simpler_env.policies.rt1.rt1_model import RT1Inference +from simpler_env.policies.openvla.openvla_model import OpenVALInference try: from simpler_env.policies.octo.octo_model import OctoInference @@ -28,7 +29,7 @@ gpus[0], [tf.config.LogicalDeviceConfiguration(memory_limit=args.tf_memory_limit)], ) - + print(f"**** {args.policy_model} ****") # policy model creation; update this if you are using a new policy model if args.policy_model == "rt1": assert args.ckpt_path is not None @@ -53,6 +54,13 @@ init_rng=args.octo_init_rng, action_scale=args.action_scale, ) + elif args.policy_model == "openvla": + assert args.ckpt_path is not None + model = OpenVALInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) else: raise NotImplementedError() diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py new file mode 100644 index 00000000..66bdf0d4 --- /dev/null +++ b/simpler_env/policies/openvla/openvla_model.py @@ -0,0 +1,203 @@ +from collections import deque +from typing import Optional, Sequence +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from simpler_env.utils.action.action_ensemble import ActionEnsembler +from transformers import AutoModelForVision2Seq, AutoProcessor +from PIL import Image +import torch +import cv2 as cv + + +class OpenVALInference: + def __init__( + self, + saved_model_path: str = "openvla/openvla-7b", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + horizon: int = 2, + pred_action_horizon: int = 1, + exec_horizon: int = 1, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key + action_ensemble = True + action_ensemble_temp = 0.0 + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key + action_ensemble = True + action_ensemble_temp = 0.0 + self.sticky_gripper_num_repeat = 15 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True) + self.vla = AutoModelForVision2Seq.from_pretrained( + "openvla/openvla-7b", + attn_implementation="flash_attention_2", # [Optional] Requires `flash_attn` + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + ).cuda() + + self.image_size = image_size + self.action_scale = action_scale + self.horizon = horizon + self.pred_action_horizon = pred_action_horizon + self.exec_horizon = exec_horizon + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.task = None + self.task_description = None + self.image_history = deque(maxlen=self.horizon) + if self.action_ensemble: + self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp) + else: + self.action_ensembler = None + self.num_image_history = 0 + + def _add_image_to_history(self, image: np.ndarray) -> None: + self.image_history.append(image) + self.num_image_history = min(self.num_image_history + 1, self.horizon) + + def reset(self, task_description: str) -> None: + self.task_description = task_description + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.num_image_history = 0 + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + self._add_image_to_history(self._resize_image(image)) + + image: Image.Image = Image.fromarray(image) + prompt = task_description + + # predict action (7-dof; un-normalize for bridgev2) + inputs = self.processor(prompt, image).to("cuda:0", dtype=torch.bfloat16) + raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None] + # print(f"*** raw actions {raw_actions} ***") + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array(raw_actions[0, 6:7]), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = 
raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False: + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) From 7bed3d4bce61ef2bf58ccbaf8f81ccff306883b9 Mon Sep 17 00:00:00 2001 From: DelinQu Date: Fri, 5 Jul 2024 18:23:37 +0800 Subject: [PATCH 2/9] intergrate openvla policy and PR --- scripts/openvla_bridge.sh | 49 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh index e8c8289e..80e733ef 100644 --- a/scripts/openvla_bridge.sh +++ b/scripts/openvla_bridge.sh @@ -7,31 +7,30 @@ robot=widowx rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png robot_init_x=0.147 robot_init_y=0.028 -export DISPLAY=:1.0 -# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json -# python simpler_env/main_inference.py --policy-model 
${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -# -# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -# -# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; scene_name=bridge_table_1_v2 From f0674e7701750e6262879b26a5a0a0954726479a Mon Sep 17 00:00:00 2001 From: DelinQu Date: Sat, 6 Jul 2024 21:50:17 +0800 Subject: [PATCH 3/9] openvla 
policy intergration pull request --- simpler_env/main_inference.py | 4 ++-- simpler_env/policies/openvla/openvla_model.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py index 7ebe20c2..699c00af 100644 --- a/simpler_env/main_inference.py +++ b/simpler_env/main_inference.py @@ -7,7 +7,7 @@ from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator from simpler_env.policies.octo.octo_server_model import OctoServerInference from simpler_env.policies.rt1.rt1_model import RT1Inference -from simpler_env.policies.openvla.openvla_model import OpenVALInference +from simpler_env.policies.openvla.openvla_model import OpenVLAInference try: from simpler_env.policies.octo.octo_model import OctoInference @@ -56,7 +56,7 @@ ) elif args.policy_model == "openvla": assert args.ckpt_path is not None - model = OpenVALInference( + model = OpenVLAInference( saved_model_path=args.ckpt_path, policy_setup=args.policy_setup, action_scale=args.action_scale, diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py index 66bdf0d4..b21f1378 100644 --- a/simpler_env/policies/openvla/openvla_model.py +++ b/simpler_env/policies/openvla/openvla_model.py @@ -11,7 +11,7 @@ import cv2 as cv -class OpenVALInference: +class OpenVLAInference: def __init__( self, saved_model_path: str = "openvla/openvla-7b", @@ -144,7 +144,7 @@ def step( relative_gripper_action = self.previous_gripper_action - current_gripper_action self.previous_gripper_action = current_gripper_action - if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False: + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): self.sticky_action_is_on = True self.sticky_gripper_action = relative_gripper_action From 29053792b9e3103c30e27b69a0268b4d3da8a710 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 28 Jul 2024 09:50:03 -0700 Subject: [PATCH 4/9] update openvla inference scripts --- scripts/octo_bridge.sh | 0 scripts/octo_drawer_variant_agg.sh | 0 scripts/octo_drawer_visual_matching.sh | 0 scripts/octo_move_near_variant_agg.sh | 0 scripts/octo_move_near_visual_matching.sh | 0 scripts/octo_pick_coke_can_variant_agg.sh | 0 scripts/octo_pick_coke_can_visual_matching.sh | 0 scripts/octo_put_in_drawer_variant_agg.sh | 0 scripts/octo_put_in_drawer_visual_matching.sh | 0 scripts/openvla_bridge.sh | 0 scripts/openvla_drawer_variant_agg.sh | 0 scripts/openvla_drawer_visual_matching.sh | 0 scripts/openvla_move_near_variant_agg.sh | 0 scripts/openvla_move_near_visual_matching.sh | 0 scripts/openvla_pick_coke_can_variant_agg.sh | 0 .../openvla_pick_coke_can_visual_matching.sh | 0 scripts/openvla_put_in_drawer_variant_agg.sh | 2 +- .../openvla_put_in_drawer_visual_matching.sh | 2 +- scripts/rt1_drawer_variant_agg.sh | 0 scripts/rt1_drawer_visual_matching.sh | 0 scripts/rt1_move_near_variant_agg.sh | 0 scripts/rt1_move_near_visual_matching.sh | 0 scripts/rt1_pick_coke_can_variant_agg.sh | 0 scripts/rt1_pick_coke_can_visual_matching.sh | 0 scripts/rt1_put_in_drawer_variant_agg.sh | 0 scripts/rt1_put_in_drawer_visual_matching.sh | 0 scripts/rt1x_bridge.sh | 0 simpler_env/policies/openvla/openvla_model.py | 26 ++----------------- 28 files changed, 4 insertions(+), 26 deletions(-) mode change 100644 => 100755 scripts/octo_bridge.sh mode change 100644 => 100755 scripts/octo_drawer_variant_agg.sh mode change 100644 => 100755 scripts/octo_drawer_visual_matching.sh mode change 100644 => 100755 
scripts/octo_move_near_variant_agg.sh mode change 100644 => 100755 scripts/octo_move_near_visual_matching.sh mode change 100644 => 100755 scripts/octo_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/octo_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/octo_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/octo_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/openvla_bridge.sh mode change 100644 => 100755 scripts/openvla_drawer_variant_agg.sh mode change 100644 => 100755 scripts/openvla_drawer_visual_matching.sh mode change 100644 => 100755 scripts/openvla_move_near_variant_agg.sh mode change 100644 => 100755 scripts/openvla_move_near_visual_matching.sh mode change 100644 => 100755 scripts/openvla_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/openvla_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/openvla_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/openvla_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1_drawer_variant_agg.sh mode change 100644 => 100755 scripts/rt1_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1_move_near_variant_agg.sh mode change 100644 => 100755 scripts/rt1_move_near_visual_matching.sh mode change 100644 => 100755 scripts/rt1_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/rt1_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/rt1_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/rt1_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1x_bridge.sh diff --git a/scripts/octo_bridge.sh b/scripts/octo_bridge.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_drawer_variant_agg.sh b/scripts/octo_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_drawer_visual_matching.sh b/scripts/octo_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_move_near_variant_agg.sh b/scripts/octo_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_move_near_visual_matching.sh b/scripts/octo_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_pick_coke_can_variant_agg.sh b/scripts/octo_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_pick_coke_can_visual_matching.sh b/scripts/octo_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_put_in_drawer_variant_agg.sh b/scripts/octo_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_put_in_drawer_visual_matching.sh b/scripts/octo_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git 
a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 index a3c63a12..fb8ce514 --- a/scripts/openvla_put_in_drawer_variant_agg.sh +++ b/scripts/openvla_put_in_drawer_variant_agg.sh @@ -2,7 +2,7 @@ -declare -a arr=("openvla/openvla-7b") +declare -a ckpt_paths=("openvla/openvla-7b") declare -a env_names=( PlaceIntoClosedTopDrawerCustomInScene-v0 diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 index 98539bf3..14dd46a4 --- a/scripts/openvla_put_in_drawer_visual_matching.sh +++ b/scripts/openvla_put_in_drawer_visual_matching.sh @@ -1,6 +1,6 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth -declare -a arr=("openvla/openvla-7b") +declare -a ckpt_paths=("openvla/openvla-7b") declare -a env_names=( diff --git a/scripts/rt1_drawer_variant_agg.sh b/scripts/rt1_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_drawer_visual_matching.sh b/scripts/rt1_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_move_near_variant_agg.sh b/scripts/rt1_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_move_near_visual_matching.sh b/scripts/rt1_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_pick_coke_can_variant_agg.sh b/scripts/rt1_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_pick_coke_can_visual_matching.sh b/scripts/rt1_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_put_in_drawer_variant_agg.sh b/scripts/rt1_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_put_in_drawer_visual_matching.sh b/scripts/rt1_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1x_bridge.sh b/scripts/rt1x_bridge.sh old mode 100644 new mode 100755 diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py index b21f1378..ae43ef0b 100644 --- a/simpler_env/policies/openvla/openvla_model.py +++ b/simpler_env/policies/openvla/openvla_model.py @@ -1,10 +1,8 @@ -from collections import deque from typing import Optional, Sequence import os import matplotlib.pyplot as plt import numpy as np from transforms3d.euler import euler2axangle -from simpler_env.utils.action.action_ensemble import ActionEnsembler from transformers import AutoModelForVision2Seq, AutoProcessor from PIL import Image import torch @@ -17,7 +15,7 @@ def __init__( saved_model_path: str = "openvla/openvla-7b", unnorm_key: Optional[str] = None, policy_setup: str = "widowx_bridge", - horizon: int = 2, + horizon: int = 1, pred_action_horizon: int = 1, exec_horizon: int = 1, image_size: list[int] = [224, 224], @@ -26,13 +24,9 @@ def __init__( os.environ["TOKENIZERS_PARALLELISM"] = "false" if policy_setup == "widowx_bridge": unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key - action_ensemble = True - action_ensemble_temp = 0.0 self.sticky_gripper_num_repeat = 1 elif policy_setup == "google_robot": unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key - action_ensemble = True - action_ensemble_temp 
= 0.0 self.sticky_gripper_num_repeat = 15 else: raise NotImplementedError( @@ -56,8 +50,6 @@ def __init__( self.horizon = horizon self.pred_action_horizon = pred_action_horizon self.exec_horizon = exec_horizon - self.action_ensemble = action_ensemble - self.action_ensemble_temp = action_ensemble_temp self.sticky_action_is_on = False self.gripper_action_repeat = 0 @@ -66,22 +58,10 @@ def __init__( self.task = None self.task_description = None - self.image_history = deque(maxlen=self.horizon) - if self.action_ensemble: - self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp) - else: - self.action_ensembler = None self.num_image_history = 0 - def _add_image_to_history(self, image: np.ndarray) -> None: - self.image_history.append(image) - self.num_image_history = min(self.num_image_history + 1, self.horizon) - def reset(self, task_description: str) -> None: self.task_description = task_description - self.image_history.clear() - if self.action_ensemble: - self.action_ensembler.reset() self.num_image_history = 0 self.sticky_action_is_on = False @@ -109,7 +89,7 @@ def step( self.reset(task_description) assert image.dtype == np.uint8 - self._add_image_to_history(self._resize_image(image)) + image = self._resize_image(image) image: Image.Image = Image.fromarray(image) prompt = task_description @@ -119,8 +99,6 @@ def step( raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None] # print(f"*** raw actions {raw_actions} ***") - if self.action_ensemble: - raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] raw_action = { "world_vector": np.array(raw_actions[0, :3]), "rotation_delta": np.array(raw_actions[0, 3:6]), From 52727f9f29ab4306e946fa595c9d0392b95f5b62 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 28 Jul 2024 09:52:19 -0700 Subject: [PATCH 5/9] update readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f232c0d4..ee813be5 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,13 @@ If you are using CUDA 12, then to use GPU for Octo inference, you need CUDA vers `PATH=/usr/local/cuda-12.3/bin:$PATH LD_LIBRARY_PATH=/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH bash scripts/octo_xxx_script.sh` +### OpenVLA Inference Setup + +``` +pip install torch==2.3.1 torchvision==0.18.1 timm==0.9.10 tokenizers==0.15.2 accelerate==0.32.1 +pip install flash-attn==2.6.1 --no-build-isolation +``` + ## Troubleshooting 1. 
If you encounter issues such as

From 37d8fa8de5cd0f7b2dda472fa8ebd2b48c6994f0 Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sat, 17 Aug 2024 23:40:26 -0700
Subject: [PATCH 6/9] Add OpenVLA metrics

---
 simpler_env/utils/metrics.py            | 82 +++++++++++++++++++++++++
 tools/calc_metrics_evaluation_videos.py | 15 +++++
 2 files changed, 97 insertions(+)

diff --git a/simpler_env/utils/metrics.py b/simpler_env/utils/metrics.py
index 39a251c9..019a5a2f 100644
--- a/simpler_env/utils/metrics.py
+++ b/simpler_env/utils/metrics.py
@@ -5,6 +5,33 @@
 import numpy as np

 REAL_PERF = {  # Real robot eval performance --> extract via: REAL_PERF[task][policy]
+    "google_robot_pick_coke_can_horizontal": {
+        "rt-2-x": 0.920,
+        "rt-1-converged": 0.960,
+        "rt-1-15pct": 1.000,
+        "rt-1-x": 0.880,
+        "rt-1-begin": 0.200,
+        "octo-base": 0.440,
+        "openvla-7b": 0.640,
+    },
+    "google_robot_pick_coke_can_vertical": {
+        "rt-2-x": 0.800,
+        "rt-1-converged": 0.880,
+        "rt-1-15pct": 0.960,
+        "rt-1-x": 0.560,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.200,
+        "openvla-7b": 0.280,
+    },
+    "google_robot_pick_coke_can_standing": {
+        "rt-2-x": 1.000,
+        "rt-1-converged": 0.720,
+        "rt-1-15pct": 0.800,
+        "rt-1-x": 0.840,
+        "rt-1-begin": 0.200,
+        "octo-base": 0.240,
+        "openvla-7b": 0.360,
+    },
     "google_robot_pick_coke_can": {
         "rt-2-x": 0.907,
         "rt-1-converged": 0.853,
@@ -12,6 +39,7 @@
         "rt-1-x": 0.760,
         "rt-1-begin": 0.133,
         "octo-base": 0.293,
+        "openvla-7b": 0.427,
     },
     "google_robot_move_near": {
         "rt-2-x": 0.733,
@@ -20,6 +48,7 @@
         "rt-1-x": 0.450,
         "rt-1-begin": 0.017,
         "octo-base": 0.350,
+        "openvla-7b": 0.667,
     },
     "google_robot_open_drawer": {
         "rt-2-x": 0.333,
@@ -28,6 +57,7 @@
         "rt-1-x": 0.519,
         "rt-1-begin": 0.000,
         "octo-base": 0.148,
+        "openvla-7b": 0.111,
     },
     "google_robot_close_drawer": {
         "rt-2-x": 0.630,
@@ -36,6 +66,16 @@
         "rt-1-x": 0.741,
         "rt-1-begin": 0.000,
         "octo-base": 0.519,
+        "openvla-7b": 0.148,
+    },
+    "google_robot_drawer": {
+        "rt-2-x": 0.481,
+        "rt-1-converged": 0.870,
+        "rt-1-15pct": 0.796,
+        "rt-1-x": 0.630,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.333,
+        "openvla-7b": 0.130,
     },
     "google_robot_place_apple_in_closed_top_drawer": {
         "rt-2-x": 0.074,
@@ -44,6 +84,7 @@
         "rt-1-x": 0.407,
         "rt-1-begin": 0.000,
         "octo-base": 0.000,
+        "openvla-7b": 0.000,
     },
     "widowx_spoon_on_towel": {
         "rt-1-x": 0.000,
@@ -69,6 +110,33 @@
 SIMPLER_PERF = {  # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy]
+    "google_robot_pick_coke_can_horizontal": {
+        "rt-2-x": 0.740,
+        "rt-1-converged": 0.960,
+        "rt-1-15pct": 0.860,
+        "rt-1-x": 0.820,
+        "rt-1-begin": 0.050,
+        "octo-base": 0.210,
+        "openvla-7b": 0.310,
+    },
+    "google_robot_pick_coke_can_vertical": {
+        "rt-2-x": 0.740,
+        "rt-1-converged": 0.900,
+        "rt-1-15pct": 0.790,
+        "rt-1-x": 0.330,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.210,
+        "openvla-7b": 0.030,
+    },
+    "google_robot_pick_coke_can_standing": {
+        "rt-2-x": 0.880,
+        "rt-1-converged": 0.710,
+        "rt-1-15pct": 0.480,
+        "rt-1-x": 0.550,
+        "rt-1-begin": 0.030,
+        "octo-base": 0.090,
+        "openvla-7b": 0.190,
+    },
     "google_robot_pick_coke_can": {
         "rt-2-x": 0.787,
         "rt-1-converged": 0.857,
@@ -76,6 +144,7 @@
         "rt-1-x": 0.567,
         "rt-1-begin": 0.027,
         "octo-base": 0.170,
+        "openvla-7b": 0.177,
     },
     "google_robot_move_near": {
         "rt-2-x": 0.779,
@@ -84,6 +153,7 @@
         "rt-1-x": 0.317,
         "rt-1-begin": 0.050,
         "octo-base": 0.042,
+        "openvla-7b": 0.492,
     },
     "google_robot_open_drawer": {
         "rt-2-x": 0.157,
@@ -92,6 +162,7 @@
         "rt-1-x": 0.296,
         "rt-1-begin": 0.000,
         "octo-base": 0.009,
+        "openvla-7b": 0.250,
     },
     "google_robot_close_drawer": {
         "rt-2-x": 0.343,
@@ -100,6 +171,16 @@
         "rt-1-x": 0.891,
         "rt-1-begin": 0.278,
         "octo-base": 0.444,
+        "openvla-7b": 0.574,
+    },
+    "google_robot_drawer": {
+        "rt-2-x": 0.250,
+        "rt-1-converged": 0.730,
+        "rt-1-15pct": 0.565,
+        "rt-1-x": 0.597,
+        "rt-1-begin": 0.139,
+        "octo-base": 0.227,
+        "openvla-7b": 0.412,
     },
     "google_robot_place_apple_in_closed_top_drawer": {
         "rt-2-x": 0.037,
@@ -108,6 +189,7 @@
         "rt-1-x": 0.213,
         "rt-1-begin": 0.000,
         "octo-base": 0.000,
+        "openvla-7b": 0.000,
     },
     "widowx_spoon_on_towel": {
         "rt-1-x": 0.000,
diff --git a/tools/calc_metrics_evaluation_videos.py b/tools/calc_metrics_evaluation_videos.py
index 1981ad25..3ea1eff5 100644
--- a/tools/calc_metrics_evaluation_videos.py
+++ b/tools/calc_metrics_evaluation_videos.py
@@ -27,6 +27,7 @@ def calc_pick_coke_can_stats(root_result_dir):
         "rt-1-x": 0.88,
         "rt-1-begin": 0.20,
         "octo-base": 0.44,
+        "openvla-7b": 0.64,
     },
     "vertical": {
         "rt-2-x": 0.80,
@@ -35,6 +36,7 @@
         "rt-1-x": 0.56,
         "rt-1-begin": 0.00,
         "octo-base": 0.20,
+        "openvla-7b": 0.28,
     },
     "standing": {
         "rt-2-x": 1.00,
@@ -43,6 +45,7 @@
         "rt-1-x": 0.84,
         "rt-1-begin": 0.20,
         "octo-base": 0.24,
+        "openvla-7b": 0.36,
     },
 }
@@ -282,6 +285,7 @@ def calc_move_near_stats(root_result_dir):
         "rt-1-x": 0.45,
         "rt-1-begin": 0.017,
         "octo-base": 0.35,
+        "openvla-7b": 0.667,
     }

     ckpt_alias_keys = list(move_near_real_success.keys())
@@ -413,6 +417,7 @@ def calc_drawer_stats(root_result_dir):
         "rt-1-x": 0.519,
         "rt-1-begin": 0.000,
         "octo-base": 0.148,
+        "openvla-7b": 0.111,
     },
     "close": {
         "rt-2-x": 0.630,
@@ -421,6 +426,7 @@
         "rt-1-x": 0.741,
         "rt-1-begin": 0.000,
         "octo-base": 0.519,
+        "openvla-7b": 0.148,
     },
 }
@@ -642,6 +648,7 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir):
         "rt-1-x": 0.407,
         "rt-1-begin": 0.000,
         "octo-base": 0.000,
+        "openvla-7b": 0.000,
     },
 }
@@ -855,21 +862,25 @@ def calc_bridge_put_on_env_stats(root_result_dir):
         "rt-1-x": 0.042,
         "octo-base": 0.500,
         "octo-small": 0.542,
+        "openvla-7b": 0.10,
     },
     "put_carrot_on_plate": {
         "rt-1-x": 0.167,
         "octo-base": 0.500,
         "octo-small": 0.208,
+        "openvla-7b": 0.10,
     },
     "stack_green_block_on_yellow_block": {
         "rt-1-x": 0.000,
         "octo-base": 0.292,
         "octo-small": 0.583,
+        "openvla-7b": 0.10,
     },
     "put_eggplant_in_basket": {
         "rt-1-x": 0.000,
         "octo-base": 0.400,
         "octo-small": 0.600,
+        "openvla-7b": 0.10,
     },
 }
 real_success_dict = {
@@ -877,17 +888,20 @@
         "rt-1-x": 0.000,
         "octo-base": 0.333,
         "octo-small": 0.417,
+        "openvla-7b": 0.10,
     },
     "put_carrot_on_plate": {"rt-1-x": 0.00, "octo-base": 0.25, "octo-small": 0.083},
     "stack_green_block_on_yellow_block": {
         "rt-1-x": 0.000,
         "octo-base": 0.000,
         "octo-small": 0.125,
+        "openvla-7b": 0.10,
     },
     "put_eggplant_in_basket": {
         "rt-1-x": 0.000,
         "octo-base": 0.250,
         "octo-small": 0.400,
+        "openvla-7b": 0.10,
     },
 }
@@ -1023,6 +1037,7 @@
     "octo-base": "octo-base",
     "octo-small": "octo-small",
     "octo-server": "octo-server",
+    "openvla-7b": "openvla-7b",
 }

 parser = argparse.ArgumentParser()
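Note: a minimal sketch of how the tables added above are meant to be consumed. The dict names, module path, and task/policy keys are exactly those in the patch (and the repo is assumed to be installed via `pip install -e .` as the README describes); the gap computation itself is illustrative, not part of the patch.

```python
# Compare real vs. SIMPLER success rates for OpenVLA on one task (illustrative).
from simpler_env.utils.metrics import REAL_PERF, SIMPLER_PERF

task = "google_robot_move_near"
real = REAL_PERF[task]["openvla-7b"]    # 0.667 per the table above
sim = SIMPLER_PERF[task]["openvla-7b"]  # 0.492 per the table above
print(f"{task}: real={real:.3f}, sim={sim:.3f}, |gap|={abs(real - sim):.3f}")
```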
From 3667e65213d9f0d826081b1f9d720297b4e70fae Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sat, 17 Aug 2024 23:55:38 -0700
Subject: [PATCH 7/9] update readme

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ee813be5..c998a097 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,10 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts
   - [Code Structure](#code-structure)
   - [Adding New Policies](#adding-new-policies)
   - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots)
-  - [Full Installation (RT-1 and Octo Inference, Env Building)](#full-installation-rt-1-and-octo-inference-env-building)
+  - [Full Installation (RT-1, Octo, OpenVLA Inference, Env Building)](#full-installation-rt-1-octo-openvla-inference-env-building)
   - [RT-1 Inference Setup](#rt-1-inference-setup)
   - [Octo Inference Setup](#octo-inference-setup)
+  - [OpenVLA Inference Setup](#openvla-inference-setup)
   - [Troubleshooting](#troubleshooting)
   - [Citation](#citation)
@@ -97,7 +98,7 @@ cd {this_repo}
 pip install -e .
 ```

-**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-and-octo-inference-env-building).**
+**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo, OpenVLA), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-octo-openvla-inference-env-building).**

 ## Examples
@@ -105,7 +106,7 @@ pip install -e .
 - Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py).
 - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb).
 - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py)
-- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, and Octo policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts).
+- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts).
 - Real-to-sim evaluation videos from running `scripts/*.sh`: see [this link](https://huggingface.co/datasets/xuanlinli17/simpler-env-eval-example-videos/tree/main).

 ## Current Environments
@@ -219,7 +220,7 @@ If you want to use existing environments for evaluating new policies, you can ke
 We provide a step-by-step guide to add new real-to-sim evaluation environments and robots in [this README](ADDING_NEW_ENVS_ROBOTS.md)

-## Full Installation (RT-1 and Octo Inference, Env Building)
+## Full Installation (RT-1, Octo, OpenVLA Inference, Env Building)

 If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please follow the full installation instructions below.
From 2b0ecc8ec6793ec2268e630b1710939feb998407 Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sun, 18 Aug 2024 09:27:27 -0700
Subject: [PATCH 8/9] add openvla simple inference

---
 README.md                                              |  2 +-
 ...imple_inference_visual_matching_prepackaged_envs.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c998a097..83239580 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ pip install -e .

 ## Examples

-- Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py).
+- Simple RT-1, Octo, and OpenVLA evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py).
 - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb).
 - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py)
 - Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts).
diff --git a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
index 8bf23b87..2e09cd22 100644
--- a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
+++ b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
@@ -6,6 +6,8 @@
         --ckpt-path ./checkpoints/rt_1_tf_trained_for_000400120 --task google_robot_pick_coke_can --logging-root ./results_simple_eval/ --n-trajs 10
     python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy octo-small \
         --ckpt-path None --task widowx_spoon_on_towel --logging-root ./results_simple_eval/ --n-trajs 10
+    python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy openvla/openvla-7b \
+        --ckpt-path None --task google_robot_move_near_v1 --logging-root ./results_simple_eval/ --n-trajs 10
 """

 import argparse
@@ -21,7 +23,7 @@

 parser = argparse.ArgumentParser()
-parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small"])
+parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small", "openvla/openvla-7b"])
 parser.add_argument(
     "--ckpt-path",
     type=str,
@@ -37,7 +39,7 @@
 parser.add_argument("--n-trajs", type=int, default=10)
 args = parser.parse_args()

-if args.policy in ["octo-base", "octo-small"]:
+if args.policy in ["octo-base", "octo-small", "openvla/openvla-7b"]:
     if args.ckpt_path in [None, "None"] or "rt_1_x" in args.ckpt_path:
         args.ckpt_path = args.policy
     if args.ckpt_path[-1] == "/":
@@ -75,6 +77,10 @@
     from simpler_env.policies.octo.octo_model import OctoInference

     model = OctoInference(model_type=args.ckpt_path, policy_setup=policy_setup, init_rng=0)
+elif "openvla" in args.policy:
+    from simpler_env.policies.openvla.openvla_model import OpenVLAInference
+
+    model = OpenVLAInference(saved_model_path=args.ckpt_path, policy_setup=policy_setup)
 else:
     raise NotImplementedError()
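Note: the dispatch above only requires that a policy class expose the same small surface as `OpenVLAInference`. A sketch of that minimal interface follows, useful when adding a new policy as the README section updated in the next patch describes. The method names are taken from the OpenVLA diffs in this series; the `(raw_action, action)` return convention and the exact action-dict keys are assumptions based on SimplerEnv's other policy wrappers.

```python
# Minimal policy-interface sketch (illustrative; class and key names beyond
# reset()/step() are hypothetical, not defined by this patch series).
import numpy as np

class MyPolicyInference:
    def __init__(self, saved_model_path: str, policy_setup: str = "google_robot") -> None:
        self.task_description = None  # load model weights here

    def reset(self, task_description: str) -> None:
        self.task_description = task_description

    def step(self, image: np.ndarray, task_description: str):
        assert image.dtype == np.uint8
        # run inference here; return (raw_action, processed_action)
        raw_action = {"world_vector": np.zeros(3), "rotation_delta": np.zeros(3)}
        return raw_action, raw_action
```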
From 2cd7aaea91a7ed19d81c7652b4c9f01e157e6444 Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sun, 18 Aug 2024 09:44:30 -0700
Subject: [PATCH 9/9] minor readme modification

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 83239580..c46c40d8 100644
--- a/README.md
+++ b/README.md
@@ -184,6 +184,7 @@ simpler_env/
    policies/: policy implementations
       rt1/: RT-1 policy implementation
       octo/: Octo policy implementation
+      openvla/: OpenVLA policy implementation
    utils/:
       env/: environment building and observation utilities
       debug/: debugging tools for policies and robots
@@ -206,7 +207,7 @@ scripts/: example bash scripts for policy inference under our variant aggregatio

 If you want to use existing environments for evaluating new policies, you can keep `./ManiSkill2_real2sim` as is.

-1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`) and Octo (`simpler_env/policies/octo`) policies.
+1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`), Octo (`simpler_env/policies/octo`), and OpenVLA (`simpler_env/policies/openvla`) policies.
 2. You can now use `simpler_env/simple_inference_visual_matching_prepackaged_envs.py` to perform policy evaluations in simulation.
    - If the policy behaviors deviate a lot from those in the real-world, you can write similar scripts as in `simpler_env/utils/debug/{policy_name}_inference_real_video.py` to debug the policy behaviors. The debugging script performs policy inference by feeding real eval video frames into the policy. If the policy behavior still deviates significantly from real, this may suggest that policy actions are processed incorrectly into the simulation environments. Please double check action orderings and action spaces.
 3. If you'd like to perform customized evaluations,