diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index c85a809f05bb..b99e1ab1ab7a 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -47,6 +47,7 @@ 'CodeActAgent': codeact_user_response, 'CodeActSWEAgent': codeact_user_response, 'SupervisorAgent': codeact_user_response, + 'DelegatorAgent': codeact_user_response, } @@ -69,6 +70,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): f'--- BEGIN HINTS ---\n{instance.hints_text}\n--- END HINTS ---\n' ) instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name) + elif metadata.agent_class == 'DelegatorAgent': + instruction = ( + f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n" + f'\n' + f'{instance.problem_statement}\n' + '\n\n' + ) else: # Instruction based on Anthropic's official trajectory # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs @@ -92,20 +100,6 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): "Your thinking should be thorough and so it's fine if it's very long.\n" ) - instruction += ( - '\n' - '- You MUST generate only one action per turn!\n' - '- A patch is a set of changes to the source code of the codebase that you are given\n' - '- You MUST generate a patch that attempts to fix the issue described in the \n' - '\n' - ) - - if RUN_WITH_BROWSING: - instruction += ( - '\n' - 'You SHOULD NEVER attempt to browse the web. ' - '\n' - ) return instruction diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 2926d7d5e459..8ff482de580f 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -73,7 +73,7 @@ class CodeActAgent(Agent): JupyterRequirement(), ] obs_prefix = 'OBSERVATION:\n' - when_to_stop = 6 + when_to_stop = -1 number_of_events = -1 def __init__( @@ -363,16 +363,6 @@ def step(self, state: State) -> Action: outputs={'fixed': True, 'trayectory': serialized_messages} ) - # if we've reached the max number of iterations, go back for an evaluation on the approach - if self.when_to_stop > 0 and state.local_iteration % self.when_to_stop == 0: - messages = self._get_messages(state) - serialized_messages = [ - msg.model_dump() for msg in messages - ] # Serialize each Message object - return AgentFinishAction( - outputs={'trayectory': serialized_messages, 'fixed': False} - ) - # prepare what we want to send to the LLM messages = self._get_messages(state) params: dict = { @@ -390,6 +380,15 @@ def step(self, state: State) -> Action: ] response = self.llm.completion(**params) + # if we've reached the max number of iterations, go back for an evaluation on the approach + if self.when_to_stop > 0 and state.local_iteration % self.when_to_stop == 0: + return AgentFinishAction( + outputs={ + 'response': response['choices'][0]['message']['content'], + 'fixed': False, + } + ) + if self.function_calling_active: actions = codeact_function_calling.response_to_actions(response) for action in actions: diff --git a/openhands/agenthub/delegator_agent/agent.py b/openhands/agenthub/delegator_agent/agent.py index 7cb987c8c3f7..e17381f5d8f7 100644 --- a/openhands/agenthub/delegator_agent/agent.py +++ b/openhands/agenthub/delegator_agent/agent.py @@ -49,7 +49,6 @@ def step(self, state: State) -> Action: if not isinstance(last_observation, AgentDelegateObservation): raise Exception('Last observation is not an AgentDelegateObservation') - goal, _ = state.get_current_user_intent() if self.current_delegate == 'study': self.current_delegate = 'coder' diff --git a/openhands/agenthub/micro/coder/prompt.md b/openhands/agenthub/micro/coder/prompt.md index 31d4439e2b36..046318030bff 100644 --- a/openhands/agenthub/micro/coder/prompt.md +++ b/openhands/agenthub/micro/coder/prompt.md @@ -21,7 +21,13 @@ Do NOT finish until you have completed the tasks. ## History {{ instructions.history_truncated }} -{{ history_to_json(state.history, max_events=20) }} +{% for event in state.history[-20:] %} +{% if event.source == "agent" %} +Agent: {{ event.action }} - {{ event.content if event.content else event.observation }} +{% else %} +User: {{ event.content if event.content else event.observation }} +{% endif %} +{% endfor %} ## Format {{ instructions.format.action }} diff --git a/openhands/agenthub/micro/study_repo_for_task/prompt.md b/openhands/agenthub/micro/study_repo_for_task/prompt.md index 91cdf3c3c6a0..d6e5ca77c5c2 100644 --- a/openhands/agenthub/micro/study_repo_for_task/prompt.md +++ b/openhands/agenthub/micro/study_repo_for_task/prompt.md @@ -24,7 +24,13 @@ implement the solution. If the codebase is empty, you should call the `finish` a ## History {{ instructions.history_truncated }} -{{ history_to_json(state.history, max_events=20) }} +{% for event in state.history[-20:] %} +{% if event.source == "agent" %} +Agent: {{ event.action }} - {{ event.content if event.content else event.observation }} +{% else %} +User: {{ event.content if event.content else event.observation }} +{% endif %} +{% endfor %} ## Format {{ instructions.format.action }} diff --git a/openhands/agenthub/micro/verifier/prompt.md b/openhands/agenthub/micro/verifier/prompt.md index 48c7a73cc45d..d3ec424565a4 100644 --- a/openhands/agenthub/micro/verifier/prompt.md +++ b/openhands/agenthub/micro/verifier/prompt.md @@ -22,7 +22,13 @@ explaining what the problem is. ## History {{ instructions.history_truncated }} -{{ history_to_json(state.history, max_events=20) }} +{% for event in state.history[-20:] %} +{% if event.source == "agent" %} +Agent: {{ event.action }} - {{ event.content if event.content else event.observation }} +{% else %} +User: {{ event.content if event.content else event.observation }} +{% endif %} +{% endfor %} ## Format {{ instructions.format.action }} diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py index 96e04348581f..196c7871329b 100644 --- a/openhands/agenthub/supervisor_agent/agent.py +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -2,9 +2,7 @@ import re from typing import Any, Dict, List -from openhands.agenthub.supervisor_agent.prompt import ( - get_prompt, -) +from openhands.agenthub.supervisor_agent.prompt import code_act_agent_prompt, get_prompt from openhands.controller.agent import Agent from openhands.controller.state.state import State from openhands.core.config import AgentConfig @@ -12,6 +10,7 @@ from openhands.core.message import Message, TextContent from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction from openhands.events.observation.delegate import AgentDelegateObservation +from openhands.events.observation.observation import Observation from openhands.llm.llm import LLM from openhands.runtime.plugins.agent_skills import AgentSkillsRequirement from openhands.runtime.plugins.jupyter import JupyterRequirement @@ -34,6 +33,7 @@ class SupervisorAgent(Agent): task: str = '' test_command: str = '' time_to_stop: int = 60 # Every 60 iterations, we stop and evaluate the approach + phase: int = 0 sandbox_plugins: list[PluginRequirement] = [ # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since @@ -56,7 +56,7 @@ def __init__(self, llm: LLM, config: AgentConfig): - llm (LLM): The llm to be used by this agent """ llm_config = LLMConfig( - model='openai/o1-mini', api_key='REDACTED', temperature=1.0 + model='openai/o1-preview', api_key='REDACTED', temperature=1.0 ) llm = LLM(llm_config) # TODO: Remove this once we have a real AgentConfig @@ -70,77 +70,53 @@ def __init__(self, llm: LLM, config: AgentConfig): def step(self, state: State) -> Action: self.logger.debug('Starting step with state: %s', state) self.logger.debug('LLM config: %s', self.llm_config) - last_observation = state.history[-1] + last_observation: Observation | None = None + for event in reversed(state.history): + if isinstance(event, Observation): + last_observation = event + break + task, _ = state.get_current_user_intent() self.task = task or '' - # import pdb; pdb.set_trace() - # Try CodeActAgent first if we haven't tried it yet - if not self.tried_direct_code: - prompt = get_prompt(self.task, [], 'initial') - raw_response = self.get_response(prompt) - match = re.search( - r'(.*?)', - raw_response, - re.DOTALL, - ) - self.augmented_task = match.group(1).strip('"') if match else self.task - self.tried_direct_code = True + if self.phase == 0: + self.phase += 1 + prompt = get_prompt(self.task, None, 'high_level_task') return AgentDelegateAction( agent='CodeActAgent', inputs={ - 'task': self.task, - 'augmented_task': self.augmented_task, - 'when_to_stop': self.time_to_stop, + 'task': prompt, + 'when_to_stop': 1, }, ) if not isinstance(last_observation, AgentDelegateObservation): - raise ValueError('Last observation is not an AgentDelegateObservation') + return AgentFinishAction() if not last_observation.outputs.get('fixed', False): - trayectory: List[Dict] = last_observation.outputs['trayectory'] - deserialized_trajectory = [ - Message( - role=msg_dict['role'], - content=[ - TextContent(text=content_text) - for content_text in [ - msg_dict['content'][0]['text'] - if isinstance(msg_dict['content'], list) - else msg_dict['content'] - ] - ], - tool_call_id=msg_dict.get('tool_call_id'), - name=msg_dict.get('name'), - ) - for msg_dict in trayectory - ] - # import pdb; pdb.set_trace() - prompt = get_prompt(self.task, deserialized_trajectory, 'right_track') - raw_response = self.get_response(prompt) - match = re.search(r'(.*?)', raw_response, re.DOTALL) - if match and 'yes' in match.group(1).lower(): - return AgentDelegateAction( - agent='CodeActAgent', - inputs={ - 'task': self.task, - 'trayectory': trayectory, - 'when_to_stop': self.time_to_stop, - }, - ) - # pdb.set_trace() - prompt = get_prompt(self.task, deserialized_trajectory, 'refactor') + response: str = last_observation.outputs['response'] + match = re.search( + r'(.*?)', str(response), re.DOTALL + ) + self.requirements = match.group(1).strip('"') if match else '' + + self.phase += 1 + prompt = get_prompt( + self.task, None, 'initial', requirements=self.requirements + ) raw_response = self.get_response(prompt) - match = re.search(r'(.*?)', raw_response, re.DOTALL) - next_step = match.group(1).strip('"') if match else '' - self.logger.debug('Suggested approach: %s', next_step) + match = re.search( + r'(.*?)', + raw_response, + re.DOTALL, + ) + steps = match.group(1).strip('"') if match else self.task + return AgentDelegateAction( agent='CodeActAgent', inputs={ 'task': self.task, - 'trayectory': trayectory, - 'next_step': next_step, + 'next_step': code_act_agent_prompt % {'steps': steps}, 'when_to_stop': self.time_to_stop, }, ) diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py index f2a032eeddf3..f76f4d7f2eb2 100644 --- a/openhands/agenthub/supervisor_agent/prompt.py +++ b/openhands/agenthub/supervisor_agent/prompt.py @@ -1,3 +1,5 @@ +from typing import Optional + from openhands.core.message import Message, TextContent HISTORY_SIZE = 20 @@ -8,9 +10,8 @@ # 2. Implementing the solution. # Then the manager needs to check if the issue has been fixed, if not, it needs to iterate. general_description = """ -You are a helpful assistant that can provides DETAILED guidance on how to fix an issue in a codebase. +You are a helpful assistant that provides a detailed step-by-step plan. """ - side_effects_description = """ You are a helpful assistant that creative insights into the side-effects of changes made. @@ -26,6 +27,7 @@ - Testing has been taken into account, so you should not mention it in any way! - Be aware of consistency issues! - Provide ONLY the related functions. (e.g. If the mentions the write function, then generate the read function). +- Encapsulate your suggestions in between and tags. EXAMPLE: @@ -34,6 +36,22 @@ After implementing those changes: - The parser functions that read the data might need to be updated to adapt to the new format. + +END OF EXAMPLE +""" + +high_level_task = """ + +%(task)s + +Can you create a summary with all the functional and non-functional requirements for the task described in ? + + +- Encapsulate your suggestions in between and tags. +- Documentation has been taken into account, so you should not mention it in any way! +- Testing has been taken into account, so you should not mention it in any way! +- Do NOT consider performance implications + """ initial_prompt = """ @@ -41,46 +59,44 @@ %(task)s -Try to imagine with all details how would you fix the . What is the root cause of the issue? -Consider opposite scenarios (eg. if the is writing to a file, consider what happens when the file is read). -Consider edge cases (eg. what if the file doesn't exist?). +I have already thought out the functional and non-functional requirements for the task described in : -I've already taken care of all changes to any of the test files described in the . This means you DON'T have to think about the testing logic or any of the tests in any way! -The idea is to make the minimal changes to non-tests files in the /workspace directory to ensure the is satisfied. + +%(requirements)s + -How would you fix the issue described in the with the least amount of steps? Generate the augmented with the least amount of steps to fix the issue in between and tags. -Each step MUST be very detailed as to why is needed. -Your thinking should be thorough and so it's fine if it's very long. -Be as detailed as possible. +create a step-by-step plan broken down into phases for how to implement this using requirements mentioned in . -Documentation has been taken into account, so you should not repeat it in the . -Testing has been taken into account, so you should not repeat it in the . You can create new tests, but never use existing tests. -ALWAYS output all your reasoning, be as detailed as possible. +Your thinking should be thorough and so it's fine if it's very long. -Follow this structure: -1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure. - - Files to explore, parts of the codebase I should focus on, keywords to look for... - - Extended reasoning... -2. Create a script to reproduce the error and execute it to confirm that the error is reproducible - - Ensure that when executing the script, you get the error described in the - - Suggested code to reproduce the error, keeping in mind the side-effects described in the previous step, so that the error and side-effects are reproducible - - Extended reasoning... -3. Edit the sourcecode of the repo to resolve the issue - - Suggest what files to change and code SUGGESTIONS. Trying to fix the issue in with the least amount of changes. - - Keep in mind for the code suggestions that I might need to change some other functions to prevent the side-effects described in the previous steps. - - Extended reasoning... -4. Rerun your reproduce script and confirm that the error is fixed! +Documentation has been taken into account, so you should not repeat it in the . +I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! -One step MUST be to recreate the issue and ensure that the error log is the same as the one described in the . +- Encapsulate your suggestions in between and tags. +- One step MUST be about reproducing the issue with a simple script, no pytest! +- The goal is to fix the issue with the MINIMAL changes to non-tests files in the /workspace directory. -Example: - +REMEMBER: the idea is to fix the issue with the MINIMAL changes to non-tests files in the /workspace directory. +""" + +code_act_agent_prompt = """ + +Can you help me implement the necessary changes to the repository so that the requirements specified in the are met? +I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! +Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the is satisfied. +Follow the steps described in to resolve the issue: + + +%(steps)s + - + +- When reproducing the issue, use a simple Python script and directly examine its output instead of pytest. + -REMEMBER: you ARE ONLY suggesting steps to fix the issue, do NOT be assertive, use the language of a suggestion. +Your turn! """ right_track_prompt = """ @@ -144,7 +160,7 @@ """ -def format_conversation(trajectory: list[Message]) -> str: +def format_conversation(trajectory: Optional[list[Message]] = None) -> str: """Format a conversation history into a readable string. Args: @@ -153,6 +169,8 @@ def format_conversation(trajectory: list[Message]) -> str: Returns: Formatted string representing the conversation """ + if trajectory is None: + trajectory = [] formatted_parts = [] for message in trajectory: @@ -170,9 +188,10 @@ def format_conversation(trajectory: list[Message]) -> str: def get_prompt( task: str, - trajectory: list[Message], + trajectory: Optional[list[Message]] = None, prompt_type: str = 'initial', augmented_task: str = '', + requirements: str = '', ) -> str: """Format and return the appropriate prompt based on prompt_type. @@ -184,27 +203,23 @@ def get_prompt( Returns: Formatted prompt string """ + if trajectory is None: + trajectory = [] # If approach is a conversation history, format it - if trajectory: - approach = format_conversation(trajectory) - else: - approach = '' + approach = format_conversation(trajectory) # Select the appropriate prompt template - if prompt_type == 'initial': - template = initial_prompt - elif prompt_type == 'right_track': - template = right_track_prompt - elif prompt_type == 'refactor': - template = refactor_prompt - elif prompt_type == 'critical': - template = critical_prompt - - # Format the selected template with the task and approach - formatted_prompt = general_description + template % { + template = { + 'initial': initial_prompt, + 'right_track': right_track_prompt, + 'refactor': refactor_prompt, + 'critical': critical_prompt, + 'high_level_task': high_level_task, + }[prompt_type] + + return general_description + template % { 'task': task, 'approach': approach, 'augmented_pr_description': augmented_task, + 'requirements': requirements, } - - return formatted_prompt