diff --git a/cradle/environment/rdr2/composite_skills/auto_shoot.py b/cradle/environment/rdr2/composite_skills/auto_shoot.py index 0af5dd0..dc92806 100644 --- a/cradle/environment/rdr2/composite_skills/auto_shoot.py +++ b/cradle/environment/rdr2/composite_skills/auto_shoot.py @@ -6,11 +6,6 @@ import torch from torchvision.ops import box_convert -try: - from groundingdino.util.inference import annotate -except: - pass - from cradle.config import Config from cradle.log import Logger from cradle.gameio.io_env import IOEnvironment @@ -18,13 +13,20 @@ from cradle.environment.rdr2.atomic_skills.move import turn from cradle.environment.rdr2.skill_registry import register_skill from cradle.utils.image_utils import exec_clip_minimap -from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect from cradle import constants config = Config() logger = Logger() io_env = IOEnvironment() +if config.is_game == True: + try: + from groundingdino.util.inference import annotate + except: + pass + + from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect + DEFAULT_MAX_SHOOTING_ITERATIONS = 100 SHOOT_PEOPLE_TARGET_NAME = "person" SHOOT_WOLVES_TARGET_NAME = "wolf" diff --git a/cradle/environment/rdr2/composite_skills/follow.py b/cradle/environment/rdr2/composite_skills/follow.py index 9742015..f9d6ae2 100644 --- a/cradle/environment/rdr2/composite_skills/follow.py +++ b/cradle/environment/rdr2/composite_skills/follow.py @@ -10,7 +10,7 @@ from cradle.environment.rdr2.atomic_skills.move import turn, move_forward from cradle.environment.rdr2.skill_registry import register_skill from cradle.utils.image_utils import exec_clip_minimap -from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect +from cradle.utils.object_utils import circle_detector_detect from cradle import constants config = Config() diff --git a/cradle/environment/skill_registry.py b/cradle/environment/skill_registry.py index e723823..8859056 100644 --- 
a/cradle/environment/skill_registry.py +++ b/cradle/environment/skill_registry.py @@ -20,7 +20,6 @@ from cradle.utils.check import is_valid_value from cradle.gameio.io_env import IOEnvironment from cradle.constants import * -from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect config = Config() diff --git a/cradle/provider/object_detect/gd_provider.py b/cradle/provider/object_detect/gd_provider.py index 6b618f6..247382e 100644 --- a/cradle/provider/object_detect/gd_provider.py +++ b/cradle/provider/object_detect/gd_provider.py @@ -3,12 +3,16 @@ from cradle.provider import BaseProvider from cradle import constants from cradle.log import Logger +from cradle.config import Config from cradle.memory import LocalMemory -from cradle.utils.object_utils import groundingdino_detect +config = Config() logger = Logger() memory = LocalMemory() +if config.is_game == True: + from cradle.utils.object_utils import groundingdino_detect + class GdProvider(BaseProvider): diff --git a/cradle/utils/image_utils.py b/cradle/utils/image_utils.py index 8400922..421d5b5 100644 --- a/cradle/utils/image_utils.py +++ b/cradle/utils/image_utils.py @@ -20,13 +20,15 @@ from cradle.config import Config from cradle.gameio import IOEnvironment from cradle.log import Logger -from cradle.utils.object_utils import groundingdino_detect from cradle import constants config = Config() io_env = IOEnvironment() logger = Logger() +if config.is_game == True: + from cradle.utils.object_utils import groundingdino_detect + def show_image(img): diff --git a/docs/envs/software.md b/docs/envs/software.md index 94e0a40..fc8c931 100644 --- a/docs/envs/software.md +++ b/docs/envs/software.md @@ -1,9 +1,24 @@ -Here are the settings for Software side. - ## Software Setup ### 1. Install Software Dependencies +**Install Segment Anything Model (SAM)** + +On Windows, install CUDA 11.8 from https://developer.nvidia.com/cuda-11-8-0-download-archive (Linux packages also available). 
+ +Ensure PyTorch is installed using the right CUDA dependencies. + +```bash +conda install pytorch torchvision cudatoolkit=11.8 -c nvidia -c pytorch +``` + +If this doesn't work, or you prefer the pip way, you can try something like: + +```bash +pip3 install --upgrade torch==2.1.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html +pip3 install torchvision==0.16.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html +``` + Download the [StableSAM](https://huggingface.co/spaces/abhishek/StableSAM/blob/main/sam_vit_h_4b8939.pth) model file and copy it to the `/cache` folder. ### 2. Change Computer Settings Before Running the Code @@ -16,9 +31,15 @@ Then, set the folder that the agent will open to display in Large icons or Extra ![Large icons](../envs/images/software/large_icon.png) -### 3. Open the software you want to test +### 3. Open the software and task you want to run +Cradle is mainly tested on Chrome, Outlook, CapCut, Meitu and Feishu. Theoretically, it can also be run on other software applications. + +#### 3.1 Follow [25 Tasks Provided](#25-tasks-provided) to choose the software and task you want to run +Change the `task_id` in `cradle/runner/app_runner.py` according to the description in `cradle/conf/env_config_[env_name].json` to switch among tasks. -Below are the exact software versions utilized in our paper: +#### 3.2 Follow [Initial Stage for Each Software](#initial-stage-for-each-software) to open the software + +Below are the exact software versions utilized: | Software | Version | | -------- | ------- | @@ -33,7 +54,7 @@ In theory, any version can be used. However, if you want to reproduce our experi ### 4. Run To simplify operations, the default LLM model we use is OpenAI's `GPT-4o`. -After opening the corresponding software in your main screen, use the follow script to let Cradle run. +After opening the corresponding software on your main screen, use the following script to let Cradle run. 
```bash # Run Chrome @@ -48,9 +69,7 @@ python runner.py --envConfig "./conf/env_config_xiuxiu.json" python runner.py --envConfig "./conf/env_config_feishu.json" ``` -Or if you want use debug mode, you need to change the `--envConfig` target in `.vscode\launch.json` to the software's JSON file in the `conf\` directory that you want to test. - -## 25 Tasks in our Paper +## 25 Tasks Provided Task Descriptions for Chrome, Outlook, CapCut, Meitu and Feishu. **Difficulty** refers to how hard it is for our agent to accomplish the corresponding tasks. @@ -87,7 +106,7 @@ Task Descriptions for Chrome, Outlook, CapCut, Meitu and Feishu. **Difficulty** | #4 Set User Status| Open the user profile menu and set my status to "In meeting". | Medium | | #5 Start Video Conference | Create a new meeting and meet now. | Easy | -## Initial Stage for Every Software +## Initial Stage for Each Software ### 1. Chrome @@ -154,8 +173,3 @@ For each task in Feishu, the initial page is shown in the figure: - Copy the `cradle\environment\chrome` folder located in `cradle\environment\` and rename it to match your software environment name. Replace all instances of "chrome" within the folder with your software's environment name. - Copy the `res\chrome` folder located in `res\` and rename it to your software environment name. Replace all instances of "chrome" within the folder with your software's environment name. Modify the prompts and template-matching icon images as needed for important UI elements that SAM2SOM cannot recognize. - -4. Debug and Terminal Modes: - - - Debug Mode: Change the `--envConfig` target in `.vscode\launch.json` to point to the software's JSON file in the `conf\` directory that you want to test. - - Terminal Mode: Pass the `--envConfig` argument to the software's JSON file in the `conf\` directory that you want to test. 
diff --git a/requirements.txt b/requirements.txt index ff78e2c..1500f74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +ahk==1.7.6 +ahk-binary==2023.9.0 backoff==2.2.1 openai==1.2.3 python-dotenv==1.0.0 @@ -16,6 +18,8 @@ aiohttp easyocr==1.7.1 spacy==3.7.2 chardet==5.2.0 +matplotlib==3.9.1 +supervision==0.21.0 pyobjc-framework-Quartz==10.0; sys_platform == "darwin" pyobjc-framework-Cocoa==10.0; sys_platform == "darwin" git+https://github.com/facebookresearch/segment-anything.git