diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index c85a809f05bb..b99e1ab1ab7a 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -47,6 +47,7 @@
'CodeActAgent': codeact_user_response,
'CodeActSWEAgent': codeact_user_response,
'SupervisorAgent': codeact_user_response,
+ 'DelegatorAgent': codeact_user_response,
}
@@ -69,6 +70,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
f'--- BEGIN HINTS ---\n{instance.hints_text}\n--- END HINTS ---\n'
)
instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name)
+ elif metadata.agent_class == 'DelegatorAgent':
+ instruction = (
+ f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n"
+ f'<pr_description>\n'
+ f'{instance.problem_statement}\n'
+ '</pr_description>\n\n'
+ )
else:
# Instruction based on Anthropic's official trajectory
# https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
@@ -92,20 +100,6 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
"Your thinking should be thorough and so it's fine if it's very long.\n"
)
- instruction += (
- '<IMPORTANT!>\n'
- '- You MUST generate only one action per turn!\n'
- '- A patch is a set of changes to the source code of the codebase that you are given\n'
- '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
- '</IMPORTANT!>\n'
- )
-
- if RUN_WITH_BROWSING:
- instruction += (
- '<IMPORTANT!>\n'
- 'You SHOULD NEVER attempt to browse the web. '
- '</IMPORTANT!>\n'
- )
return instruction
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 2926d7d5e459..8ff482de580f 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -73,7 +73,7 @@ class CodeActAgent(Agent):
JupyterRequirement(),
]
obs_prefix = 'OBSERVATION:\n'
- when_to_stop = 6
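+ # -1 disables the periodic early-finish check in step(); the SupervisorAgent supplies a positive 'when_to_stop' through its delegation inputs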
+ when_to_stop = -1
number_of_events = -1
def __init__(
@@ -363,16 +363,6 @@ def step(self, state: State) -> Action:
outputs={'fixed': True, 'trayectory': serialized_messages}
)
- # if we've reached the max number of iterations, go back for an evaluation on the approach
- if self.when_to_stop > 0 and state.local_iteration % self.when_to_stop == 0:
- messages = self._get_messages(state)
- serialized_messages = [
- msg.model_dump() for msg in messages
- ] # Serialize each Message object
- return AgentFinishAction(
- outputs={'trayectory': serialized_messages, 'fixed': False}
- )
-
# prepare what we want to send to the LLM
messages = self._get_messages(state)
params: dict = {
@@ -390,6 +380,15 @@ def step(self, state: State) -> Action:
]
response = self.llm.completion(**params)
+ # if we've reached the max number of iterations, go back for an evaluation on the approach
+ if self.when_to_stop > 0 and state.local_iteration % self.when_to_stop == 0:
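+ # report the latest LLM response back to the delegating agent for evaluation instead of acting on it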
+ return AgentFinishAction(
+ outputs={
+ 'response': response['choices'][0]['message']['content'],
+ 'fixed': False,
+ }
+ )
+
if self.function_calling_active:
actions = codeact_function_calling.response_to_actions(response)
for action in actions:
diff --git a/openhands/agenthub/delegator_agent/agent.py b/openhands/agenthub/delegator_agent/agent.py
index 7cb987c8c3f7..e17381f5d8f7 100644
--- a/openhands/agenthub/delegator_agent/agent.py
+++ b/openhands/agenthub/delegator_agent/agent.py
@@ -49,7 +49,6 @@ def step(self, state: State) -> Action:
if not isinstance(last_observation, AgentDelegateObservation):
raise Exception('Last observation is not an AgentDelegateObservation')
-
goal, _ = state.get_current_user_intent()
if self.current_delegate == 'study':
self.current_delegate = 'coder'
diff --git a/openhands/agenthub/micro/coder/prompt.md b/openhands/agenthub/micro/coder/prompt.md
index 31d4439e2b36..046318030bff 100644
--- a/openhands/agenthub/micro/coder/prompt.md
+++ b/openhands/agenthub/micro/coder/prompt.md
@@ -21,7 +21,13 @@ Do NOT finish until you have completed the tasks.
## History
{{ instructions.history_truncated }}
-{{ history_to_json(state.history, max_events=20) }}
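+{# Render the last 20 events as a readable transcript instead of dumping them as JSON #}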
+{% for event in state.history[-20:] %}
+{% if event.source == "agent" %}
+Agent: {{ event.action }} - {{ event.content if event.content else event.observation }}
+{% else %}
+User: {{ event.content if event.content else event.observation }}
+{% endif %}
+{% endfor %}
## Format
{{ instructions.format.action }}
diff --git a/openhands/agenthub/micro/study_repo_for_task/prompt.md b/openhands/agenthub/micro/study_repo_for_task/prompt.md
index 91cdf3c3c6a0..d6e5ca77c5c2 100644
--- a/openhands/agenthub/micro/study_repo_for_task/prompt.md
+++ b/openhands/agenthub/micro/study_repo_for_task/prompt.md
@@ -24,7 +24,13 @@ implement the solution. If the codebase is empty, you should call the `finish` a
## History
{{ instructions.history_truncated }}
-{{ history_to_json(state.history, max_events=20) }}
+{% for event in state.history[-20:] %}
+{% if event.source == "agent" %}
+Agent: {{ event.action }} - {{ event.content if event.content else event.observation }}
+{% else %}
+User: {{ event.content if event.content else event.observation }}
+{% endif %}
+{% endfor %}
## Format
{{ instructions.format.action }}
diff --git a/openhands/agenthub/micro/verifier/prompt.md b/openhands/agenthub/micro/verifier/prompt.md
index 48c7a73cc45d..d3ec424565a4 100644
--- a/openhands/agenthub/micro/verifier/prompt.md
+++ b/openhands/agenthub/micro/verifier/prompt.md
@@ -22,7 +22,13 @@ explaining what the problem is.
## History
{{ instructions.history_truncated }}
-{{ history_to_json(state.history, max_events=20) }}
+{% for event in state.history[-20:] %}
+{% if event.source == "agent" %}
+Agent: {{ event.action }} - {{ event.content if event.content else event.observation }}
+{% else %}
+User: {{ event.content if event.content else event.observation }}
+{% endif %}
+{% endfor %}
## Format
{{ instructions.format.action }}
diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py
index 96e04348581f..196c7871329b 100644
--- a/openhands/agenthub/supervisor_agent/agent.py
+++ b/openhands/agenthub/supervisor_agent/agent.py
@@ -2,9 +2,7 @@
import re
from typing import Any, Dict, List
-from openhands.agenthub.supervisor_agent.prompt import (
- get_prompt,
-)
+from openhands.agenthub.supervisor_agent.prompt import code_act_agent_prompt, get_prompt
from openhands.controller.agent import Agent
from openhands.controller.state.state import State
from openhands.core.config import AgentConfig
@@ -12,6 +10,7 @@
from openhands.core.message import Message, TextContent
from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction
from openhands.events.observation.delegate import AgentDelegateObservation
+from openhands.events.observation.observation import Observation
from openhands.llm.llm import LLM
from openhands.runtime.plugins.agent_skills import AgentSkillsRequirement
from openhands.runtime.plugins.jupyter import JupyterRequirement
@@ -34,6 +33,7 @@ class SupervisorAgent(Agent):
task: str = ''
test_command: str = ''
time_to_stop: int = 60 # Every 60 iterations, we stop and evaluate the approach
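+ # phase 0: first delegation gathers a requirements summary; later phases plan the fix and delegate the implementation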
+ phase: int = 0
sandbox_plugins: list[PluginRequirement] = [
# NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
@@ -56,7 +56,7 @@ def __init__(self, llm: LLM, config: AgentConfig):
- llm (LLM): The llm to be used by this agent
"""
llm_config = LLMConfig(
- model='openai/o1-mini', api_key='REDACTED', temperature=1.0
+ model='openai/o1-preview', api_key='REDACTED', temperature=1.0
)
llm = LLM(llm_config)
# TODO: Remove this once we have a real AgentConfig
@@ -70,77 +70,53 @@ def __init__(self, llm: LLM, config: AgentConfig):
def step(self, state: State) -> Action:
self.logger.debug('Starting step with state: %s', state)
self.logger.debug('LLM config: %s', self.llm_config)
- last_observation = state.history[-1]
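+ # the last history event may be an action, so walk backwards to the most recent Observation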
+ last_observation: Observation | None = None
+ for event in reversed(state.history):
+ if isinstance(event, Observation):
+ last_observation = event
+ break
+
task, _ = state.get_current_user_intent()
self.task = task or ''
- # import pdb; pdb.set_trace()
- # Try CodeActAgent first if we haven't tried it yet
- if not self.tried_direct_code:
- prompt = get_prompt(self.task, [], 'initial')
- raw_response = self.get_response(prompt)
- match = re.search(
- r'<augmented_pr_description>(.*?)</augmented_pr_description>',
- raw_response,
- re.DOTALL,
- )
- self.augmented_task = match.group(1).strip('"') if match else self.task
- self.tried_direct_code = True
+ if self.phase == 0:
+ self.phase += 1
+ prompt = get_prompt(self.task, None, 'high_level_task')
return AgentDelegateAction(
agent='CodeActAgent',
inputs={
- 'task': self.task,
- 'augmented_task': self.augmented_task,
- 'when_to_stop': self.time_to_stop,
+ 'task': prompt,
+ 'when_to_stop': 1,
},
)
if not isinstance(last_observation, AgentDelegateObservation):
- raise ValueError('Last observation is not an AgentDelegateObservation')
+ return AgentFinishAction()
if not last_observation.outputs.get('fixed', False):
- trayectory: List[Dict] = last_observation.outputs['trayectory']
- deserialized_trajectory = [
- Message(
- role=msg_dict['role'],
- content=[
- TextContent(text=content_text)
- for content_text in [
- msg_dict['content'][0]['text']
- if isinstance(msg_dict['content'], list)
- else msg_dict['content']
- ]
- ],
- tool_call_id=msg_dict.get('tool_call_id'),
- name=msg_dict.get('name'),
- )
- for msg_dict in trayectory
- ]
- # import pdb; pdb.set_trace()
- prompt = get_prompt(self.task, deserialized_trajectory, 'right_track')
- raw_response = self.get_response(prompt)
- match = re.search(r'(.*?)', raw_response, re.DOTALL)
- if match and 'yes' in match.group(1).lower():
- return AgentDelegateAction(
- agent='CodeActAgent',
- inputs={
- 'task': self.task,
- 'trayectory': trayectory,
- 'when_to_stop': self.time_to_stop,
- },
- )
- # pdb.set_trace()
- prompt = get_prompt(self.task, deserialized_trajectory, 'refactor')
+ response: str = last_observation.outputs['response']
+ match = re.search(
+ r'<requirements>(.*?)</requirements>', str(response), re.DOTALL
+ )
+ self.requirements = match.group(1).strip('"') if match else ''
+
+ self.phase += 1
+ prompt = get_prompt(
+ self.task, None, 'initial', requirements=self.requirements
+ )
raw_response = self.get_response(prompt)
- match = re.search(r'(.*?)', raw_response, re.DOTALL)
- next_step = match.group(1).strip('"') if match else ''
- self.logger.debug('Suggested approach: %s', next_step)
+ match = re.search(
+ r'<steps>(.*?)</steps>',
+ raw_response,
+ re.DOTALL,
+ )
+ steps = match.group(1).strip('"') if match else self.task
+
return AgentDelegateAction(
agent='CodeActAgent',
inputs={
'task': self.task,
- 'trayectory': trayectory,
- 'next_step': next_step,
+ 'next_step': code_act_agent_prompt % {'steps': steps},
'when_to_stop': self.time_to_stop,
},
)
diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py
index f2a032eeddf3..f76f4d7f2eb2 100644
--- a/openhands/agenthub/supervisor_agent/prompt.py
+++ b/openhands/agenthub/supervisor_agent/prompt.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
from openhands.core.message import Message, TextContent
HISTORY_SIZE = 20
@@ -8,9 +10,8 @@
# 2. Implementing the solution.
# Then the manager needs to check if the issue has been fixed, if not, it needs to iterate.
general_description = """
-You are a helpful assistant that can provides DETAILED guidance on how to fix an issue in a codebase.
+You are a helpful assistant that provides a detailed step-by-step plan.
"""
-
side_effects_description = """
You are a helpful assistant that creative insights into the side-effects of changes made.
@@ -26,6 +27,7 @@
- Testing has been taken into account, so you should not mention it in any way!
- Be aware of consistency issues!
- Provide ONLY the related functions. (e.g. If the mentions the write function, then generate the read function).
+- Encapsulate your suggestions in between <suggestions> and </suggestions> tags.
EXAMPLE:
@@ -34,6 +36,22 @@
After implementing those changes:
- The parser functions that read the data might need to be updated to adapt to the new format.
+
+END OF EXAMPLE
+"""
+
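+# Sent as the task of the first CodeActAgent delegation (when_to_stop=1) to distill the issue into functional and non-functional requirements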
+high_level_task = """
+
+%(task)s
+
+Can you create a summary with all the functional and non-functional requirements for the task described in <pr_description>?
+
+
+- Encapsulate your suggestions in between <requirements> and </requirements> tags.
+- Documentation has been taken into account, so you should not mention it in any way!
+- Testing has been taken into account, so you should not mention it in any way!
+- Do NOT consider performance implications
+
"""
initial_prompt = """
@@ -41,46 +59,44 @@
%(task)s
-Try to imagine with all details how would you fix the <pr_description>. What is the root cause of the issue?
-Consider opposite scenarios (eg. if the <pr_description> is writing to a file, consider what happens when the file is read).
-Consider edge cases (eg. what if the file doesn't exist?).
+I have already thought out the functional and non-functional requirements for the task described in <pr_description>:
-I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to think about the testing logic or any of the tests in any way!
-The idea is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.
+<requirements>
+%(requirements)s
+</requirements>
-How would you fix the issue described in the <pr_description> with the least amount of steps? Generate the augmented <pr_description> with the least amount of steps to fix the issue in between <augmented_pr_description> and </augmented_pr_description> tags.
-Each step MUST be very detailed as to why is needed.
-Your thinking should be thorough and so it's fine if it's very long.
-Be as detailed as possible.
+Create a step-by-step plan broken down into phases for how to implement this using the requirements mentioned in <requirements>.
-Documentation has been taken into account, so you should not repeat it in the <augmented_pr_description>.
-Testing has been taken into account, so you should not repeat it in the <augmented_pr_description>. You can create new tests, but never use existing tests.
-ALWAYS output all your reasoning, be as detailed as possible.
+Your thinking should be thorough and so it's fine if it's very long.
-Follow this structure:
-1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
- - Files to explore, parts of the codebase I should focus on, keywords to look for...
- - Extended reasoning...
-2. Create a script to reproduce the error and execute it to confirm that the error is reproducible
- Ensure that when executing the script, you get the error described in the <pr_description>
- - Suggested code to reproduce the error, keeping in mind the side-effects described in the previous step, so that the error and side-effects are reproducible
- - Extended reasoning...
-3. Edit the sourcecode of the repo to resolve the issue
- Suggest what files to change and code SUGGESTIONS. Trying to fix the issue in <pr_description> with the least amount of changes.
- - Keep in mind for the code suggestions that I might need to change some other functions to prevent the side-effects described in the previous steps.
- - Extended reasoning...
-4. Rerun your reproduce script and confirm that the error is fixed!
+Documentation has been taken into account, so you should not repeat it in the <steps>.
+I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
-One step MUST be to recreate the issue and ensure that the error log is the same as the one described in the <pr_description>.
+- Encapsulate your suggestions in between <steps> and </steps> tags.
+- One step MUST be about reproducing the issue with a simple script, no pytest!
+- The goal is to fix the issue with the MINIMAL changes to non-tests files in the /workspace directory.
-Example:
-
+REMEMBER: the idea is to fix the issue with the MINIMAL changes to non-tests files in the /workspace directory.
+"""
+
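+# Instruction template passed to the delegated CodeActAgent as 'next_step', with the generated plan substituted into %(steps)s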
+code_act_agent_prompt = """
+
+Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?
+I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.
+Follow the steps described in <steps> to resolve the issue:
+
+<steps>
+%(steps)s
+</steps>
-
+
+- When reproducing the issue, use a simple Python script and directly examine its output instead of pytest.
+
-REMEMBER: you ARE ONLY suggesting steps to fix the issue, do NOT be assertive, use the language of a suggestion.
+Your turn!
"""
right_track_prompt = """
@@ -144,7 +160,7 @@
"""
-def format_conversation(trajectory: list[Message]) -> str:
+def format_conversation(trajectory: Optional[list[Message]] = None) -> str:
"""Format a conversation history into a readable string.
Args:
@@ -153,6 +169,8 @@ def format_conversation(trajectory: list[Message]) -> str:
Returns:
Formatted string representing the conversation
"""
+ if trajectory is None:
+ trajectory = []
formatted_parts = []
for message in trajectory:
@@ -170,9 +188,10 @@ def format_conversation(trajectory: list[Message]) -> str:
def get_prompt(
task: str,
- trajectory: list[Message],
+ trajectory: Optional[list[Message]] = None,
prompt_type: str = 'initial',
augmented_task: str = '',
+ requirements: str = '',
) -> str:
"""Format and return the appropriate prompt based on prompt_type.
@@ -184,27 +203,23 @@ def get_prompt(
Returns:
Formatted prompt string
"""
+ if trajectory is None:
+ trajectory = []
# If approach is a conversation history, format it
- if trajectory:
- approach = format_conversation(trajectory)
- else:
- approach = ''
+ approach = format_conversation(trajectory)
# Select the appropriate prompt template
- if prompt_type == 'initial':
- template = initial_prompt
- elif prompt_type == 'right_track':
- template = right_track_prompt
- elif prompt_type == 'refactor':
- template = refactor_prompt
- elif prompt_type == 'critical':
- template = critical_prompt
-
- # Format the selected template with the task and approach
- formatted_prompt = general_description + template % {
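+ # look up the template for the requested prompt type (an unknown prompt_type raises KeyError)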
+ template = {
+ 'initial': initial_prompt,
+ 'right_track': right_track_prompt,
+ 'refactor': refactor_prompt,
+ 'critical': critical_prompt,
+ 'high_level_task': high_level_task,
+ }[prompt_type]
+
+ return general_description + template % {
'task': task,
'approach': approach,
'augmented_pr_description': augmented_task,
+ 'requirements': requirements,
}
-
- return formatted_prompt