diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 91d04a75ef6a..2926d7d5e459 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -1,4 +1,3 @@ -import json import os from collections import deque from itertools import islice @@ -13,6 +12,7 @@ from openhands.core.config.llm_config import LLMConfig from openhands.core.logger import openhands_logger as logger from openhands.core.message import ImageContent, Message, TextContent +from openhands.core.utils import json from openhands.events.action import ( Action, AgentDelegateAction, @@ -73,6 +73,8 @@ class CodeActAgent(Agent): JupyterRequirement(), ] obs_prefix = 'OBSERVATION:\n' + when_to_stop = 6 + number_of_events = -1 def __init__( self, @@ -85,6 +87,7 @@ def __init__( - llm (LLM): The llm to be used by this agent """ + # import pdb; pdb.set_trace() llm_config = LLMConfig( model='litellm_proxy/claude-3-5-sonnet-20241022', api_key='REDACTED', @@ -93,10 +96,9 @@ def __init__( ) llm = LLM(llm_config) # TODO: Remove this once we have a real AgentConfig - config = AgentConfig(llm_config='o1-mini') + config = AgentConfig() super().__init__(llm, config) self.reset() - self.micro_agent = ( MicroAgent( os.path.join( @@ -343,6 +345,11 @@ def step(self, state: State) -> Action: - MessageAction(content) - Message action to run (e.g. ask for clarification) - AgentFinishAction() - end the interaction """ + + # If this agent has a supervisor, we need to get the time to stop from the supervisor + if self.when_to_stop < 0 and state.inputs.get('when_to_stop', None): + self.when_to_stop = state.inputs['when_to_stop'] + # Continue with pending actions if any if self.pending_actions: return self.pending_actions.popleft() @@ -350,7 +357,21 @@ def step(self, state: State) -> Action: # if we're done, go back last_user_message = state.get_last_user_message() if last_user_message and last_user_message.strip() == '/exit': - return AgentFinishAction() + messages = self._get_messages(state) + serialized_messages = [msg.model_dump() for msg in messages] + return AgentFinishAction( + outputs={'fixed': True, 'trayectory': serialized_messages} + ) + + # if we've reached the max number of iterations, go back for an evaluation on the approach + if self.when_to_stop > 0 and state.local_iteration % self.when_to_stop == 0: + messages = self._get_messages(state) + serialized_messages = [ + msg.model_dump() for msg in messages + ] # Serialize each Message object + return AgentFinishAction( + outputs={'trayectory': serialized_messages, 'fixed': False} + ) # prepare what we want to send to the LLM messages = self._get_messages(state) @@ -409,17 +430,60 @@ def _get_messages(self, state: State) -> list[Message]: - Messages from the same role are combined to prevent consecutive same-role messages - For Anthropic models, specific messages are cached according to their documentation """ - messages: list[Message] = [ - Message( - role='system', - content=[ - TextContent( - text=self.system_prompt, - cache_prompt=self.llm.is_caching_prompt_active(), # Cache system prompt - ) - ], + # import pdb; pdb.set_trace() + messages: list[Message] = [] + trayectory = state.inputs.get('trayectory', '') + # If there is no trayectory, its the first time we are seeing the task + if not trayectory: + messages.append( + Message( + role='system', + content=[ + TextContent( + text=self.system_prompt, + cache_prompt=self.llm.is_caching_prompt_active(), # Cache system 
prompt + ) + ], + ) ) - ] + if state.inputs.get('task', '') != '': + # During AgentDelegation the history is empty, so we add the task as the user message. + messages.append( + Message( + role='user', + content=[TextContent(text=state.inputs['task'])], + ) + ) + + if state.inputs.get('augmented_task', ''): + messages.append( + Message( + role='user', + content=[TextContent(text=state.inputs['augmented_task'])], + ) + ) + else: + # If there is a previous trayectory, we restore it. + deserialized_trajectory = [ + Message( + role='user', + content=[ + TextContent(text=content_text) + for content_text in [ + msg_dict['content'][0]['text'] + if isinstance(msg_dict['content'], list) + else msg_dict['content'] + ] + if content_text # Skip empty content + ], + tool_call_id=msg_dict.get('tool_call_id'), + name=msg_dict.get('name'), + ) + for msg_dict in trayectory + if msg_dict.get('content') # Skip messages with no content + ] + messages.extend(deserialized_trajectory) + if self.initial_user_message: messages.append( Message( @@ -431,7 +495,9 @@ def _get_messages(self, state: State) -> list[Message]: pending_tool_call_action_messages: dict[str, Message] = {} tool_call_id_to_message: dict[str, Message] = {} events = list(state.history) - for event in events: + if self.number_of_events < 0: + self.number_of_events = len(events) + for i, event in enumerate(events): # create a regular message from an event if isinstance(event, Action): messages_to_add = self.get_action_message( @@ -446,6 +512,14 @@ def _get_messages(self, state: State) -> list[Message]: else: raise ValueError(f'Unknown event type: {type(event)}') + if i == self.number_of_events and state.inputs.get('next_step', ''): + messages_to_add = [ + Message( + role='user', + content=[TextContent(text=state.inputs['next_step'])], + ) + ] + # Check pending tool call action messages and see if they are complete _response_ids_to_remove = [] for ( @@ -488,6 +562,13 @@ def _get_messages(self, state: State) -> list[Message]: else: messages.append(message) + if self.number_of_events == len(events) and state.inputs.get('next_step', ''): + messages.append( + Message( + role='user', content=[TextContent(text=state.inputs['next_step'])] + ) + ) + if self.llm.is_caching_prompt_active(): # NOTE: this is only needed for anthropic # following logic here: diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 1799478601bd..3a888f2e11b1 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -13,6 +13,7 @@ ) from openhands.core.logger import openhands_logger as logger +from openhands.core.message import Message from openhands.events.action import ( Action, AgentDelegateAction, @@ -448,7 +449,11 @@ def combine_thought(action: Action, thought: str) -> Action: return action -def response_to_actions(response: ModelResponse) -> list[Action]: +def response_to_actions( + response: ModelResponse, messages: list[Message] | None = None +) -> list[Action]: + if messages is None: + messages = [] actions: list[Action] = [] assert len(response.choices) == 1, 'Only one choice is supported for now' assistant_msg = response.choices[0].message @@ -481,7 +486,9 @@ def response_to_actions(response: ModelResponse) -> list[Action]: inputs=arguments, ) elif tool_call.function.name == 'finish': - action = AgentFinishAction() + action = AgentFinishAction( + outputs={'fixed': True, 'trayectory': messages} + ) elif tool_call.function.name == 
'edit_file': action = FileEditAction(**arguments) elif tool_call.function.name == 'str_replace_editor': diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py index 722d7365cb3a..96e04348581f 100644 --- a/openhands/agenthub/supervisor_agent/agent.py +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -1,8 +1,8 @@ import logging -from typing import Any, Dict, List, Literal, Union +import re +from typing import Any, Dict, List from openhands.agenthub.supervisor_agent.prompt import ( - TASK_TYPE_ISSUE, get_prompt, ) from openhands.controller.agent import Agent @@ -10,11 +10,12 @@ from openhands.core.config import AgentConfig from openhands.core.config.llm_config import LLMConfig from openhands.core.message import Message, TextContent -from openhands.core.utils import json from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction -from openhands.events.action.agent import AgentRejectAction from openhands.events.observation.delegate import AgentDelegateObservation from openhands.llm.llm import LLM +from openhands.runtime.plugins.agent_skills import AgentSkillsRequirement +from openhands.runtime.plugins.jupyter import JupyterRequirement +from openhands.runtime.plugins.requirement import PluginRequirement class SupervisorAgent(Agent): @@ -32,7 +33,21 @@ class SupervisorAgent(Agent): does_it_needs_a_test: bool = False task: str = '' test_command: str = '' - phase: Literal['search', 'summary', 'code'] = 'search' + time_to_stop: int = 60 # Every 60 iterations, we stop and evaluate the approach + + sandbox_plugins: list[PluginRequirement] = [ + # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since + # AgentSkillsRequirement provides a lot of Python functions, + # and it needs to be initialized before Jupyter for Jupyter to use those functions. 
+ AgentSkillsRequirement(), + JupyterRequirement(), + ] + + # Add class attribute for tried_direct_code + tried_direct_code: bool = False + + # Add class attribute for augmented_task + augmented_task: str = '' def __init__(self, llm: LLM, config: AgentConfig): """Initialize the Supervisor Agent with an LLM @@ -55,122 +70,85 @@ def __init__(self, llm: LLM, config: AgentConfig): def step(self, state: State) -> Action: self.logger.debug('Starting step with state: %s', state) self.logger.debug('LLM config: %s', self.llm_config) - - if len(self.suggested_approaches) == 0: - self.suggested_approaches = self.get_suggested_approaches(state) - self.suggested_approach_index += 1 - - last_observation = state.history[-1] if state.history else None - if isinstance(last_observation, AgentDelegateObservation): - self.results[self.phase].append(last_observation.outputs.get('output', '')) - - if self.suggested_approach_index < len(self.suggested_approaches): - # Delegate to the SearcherAgent as we need to gather more information - return self.delegate_to_agent( - 'SearcherAgent', - self.task, - self.suggested_approaches[self.suggested_approach_index].get( - 'suggested_approach', [] - ), + last_observation = state.history[-1] + task, _ = state.get_current_user_intent() + self.task = task or '' + + # import pdb; pdb.set_trace() + # Try CodeActAgent first if we haven't tried it yet + if not self.tried_direct_code: + prompt = get_prompt(self.task, [], 'initial') + raw_response = self.get_response(prompt) + match = re.search( + r'(.*?)', + raw_response, + re.DOTALL, ) - - if self.phase == 'search': - condensed_information = self.ask_llm( - self.task, 'summary', self.results[self.phase] + self.augmented_task = match.group(1).strip('"') if match else self.task + self.tried_direct_code = True + return AgentDelegateAction( + agent='CodeActAgent', + inputs={ + 'task': self.task, + 'augmented_task': self.augmented_task, + 'when_to_stop': self.time_to_stop, + }, ) - if condensed_information and len(condensed_information) > 0: - first_result = condensed_information[0] - if first_result.get('summary', '') != '': - self.phase = 'summary' - self.condensed_information = first_result.get('summary', '') - else: - suggested_approach: str | list[str] = first_result.get( - 'suggested_approach', [] - ) - self.results['search'].append(suggested_approach) - return self.delegate_to_agent( - 'SearcherAgent', self.task, suggested_approach - ) - if self.phase == 'summary': - if not self.does_it_needs_a_test: - test_check = self.ask_llm(self.task, 'code', self.condensed_information) - first_check = ( - test_check[0] if test_check and len(test_check) > 0 else {} - ) - self.does_it_needs_a_test = ( - first_check.get('suggested_approach', '') == TASK_TYPE_ISSUE + if not isinstance(last_observation, AgentDelegateObservation): + raise ValueError('Last observation is not an AgentDelegateObservation') + + if not last_observation.outputs.get('fixed', False): + trayectory: List[Dict] = last_observation.outputs['trayectory'] + deserialized_trajectory = [ + Message( + role=msg_dict['role'], + content=[ + TextContent(text=content_text) + for content_text in [ + msg_dict['content'][0]['text'] + if isinstance(msg_dict['content'], list) + else msg_dict['content'] + ] + ], + tool_call_id=msg_dict.get('tool_call_id'), + name=msg_dict.get('name'), ) - self.phase = 'code' - if self.does_it_needs_a_test: - self.current_delegate = 'TesterAgent' - return AgentDelegateAction( - agent='TesterAgent', - inputs={ - 'task': self.task, - 'summary': 
self.condensed_information, - }, - ) - if self.phase == 'code': - if ( - self.does_it_needs_a_test - and last_observation is not None - and isinstance(last_observation, AgentDelegateObservation) - ): - self.test_command = last_observation.outputs.get('output', '') + for msg_dict in trayectory + ] + # import pdb; pdb.set_trace() + prompt = get_prompt(self.task, deserialized_trajectory, 'right_track') + raw_response = self.get_response(prompt) + match = re.search(r'(.*?)', raw_response, re.DOTALL) + if match and 'yes' in match.group(1).lower(): return AgentDelegateAction( - agent='CoderAgent', + agent='CodeActAgent', inputs={ 'task': self.task, - 'summary': self.condensed_information, - 'test_command': self.test_command, + 'trayectory': trayectory, + 'when_to_stop': self.time_to_stop, }, ) - + # pdb.set_trace() + prompt = get_prompt(self.task, deserialized_trajectory, 'refactor') + raw_response = self.get_response(prompt) + match = re.search(r'(.*?)', raw_response, re.DOTALL) + next_step = match.group(1).strip('"') if match else '' + self.logger.debug('Suggested approach: %s', next_step) + return AgentDelegateAction( + agent='CodeActAgent', + inputs={ + 'task': self.task, + 'trayectory': trayectory, + 'next_step': next_step, + 'when_to_stop': self.time_to_stop, + }, + ) return AgentFinishAction() - def get_suggested_approaches(self, state: State): - self.logger.debug('No suggested approaches found, breaking down task.') - task, _ = state.get_current_user_intent() - if not task: - return [] - self.task = task - suggested_approaches = self.ask_llm(self.task, 'search') - self.logger.debug('Suggested approaches: %s', self.suggested_approaches) - if not suggested_approaches: - return AgentRejectAction() - return suggested_approaches - - def delegate_to_agent( - self, agent_name: str, task: str, suggested_approach: Union[str, List[str]] - ) -> AgentDelegateAction: - self.logger.debug(f'Delegating to agent: {agent_name}') - self.current_delegate = agent_name - # Join the list of strings with newlines if it's a list - approach = ( - '\n'.join(suggested_approach) - if isinstance(suggested_approach, list) - else suggested_approach - ) - return AgentDelegateAction( - agent=agent_name, inputs={'task': task, 'suggested_approach': approach} - ) - - def ask_llm( - self, task: str, phase: str, search_results: Union[str, List[str]] = '' - ) -> List[Dict[str, str]]: - # Format search_results as one item per line if it's a list - if isinstance(search_results, list): - search_results = '\n'.join(search_results) - prompt = get_prompt(task, phase, search_results) - return self.get_response(prompt) - - def get_response(self, prompt: str) -> List[Dict[str, str]]: - content = [TextContent(text=prompt)] - message = Message(role='user', content=content) + def get_response(self, prompt: str) -> str: + message = Message(role='user', content=[TextContent(text=prompt)]) response = self.llm.completion( messages=self.llm.format_messages_for_llm(message) ) - if isinstance(response, list): - return json.loads(response[0]['message']['content']) - return json.loads(response['choices'][0]['message']['content']) + return response['choices'][0]['message']['content'] diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py index 4d8e68a92df9..f2a032eeddf3 100644 --- a/openhands/agenthub/supervisor_agent/prompt.py +++ b/openhands/agenthub/supervisor_agent/prompt.py @@ -1,3 +1,5 @@ +from openhands.core.message import Message, TextContent + HISTORY_SIZE = 20 # General Description, the 
goal is to devise a manager that is able to iterate if the solution has not been found yet.
@@ -6,329 +8,203 @@
 # 2. Implementing the solution.
 # Then the manager needs to check if the issue has been fixed, if not, it needs to iterate.
 general_description = """
-You are a strategic planner AI in a software development team. You have a team of agents
-who will complete the tasks you give them. Each agent is an expert in a specific area,
-but it can only focus on one very specific sub-task at a time.
+You are a helpful assistant that provides DETAILED guidance on how to fix an issue in a codebase.
+"""
-Your goal is to complete the following task:
-%(task)s
+side_effects_description = """
+You are a helpful assistant that provides creative insights into the side-effects of changes made.
+<approach>
+%(approach)s
+</approach>
+Imagine that the changes described in the <approach> have been implemented.
+Now this feature is being used. During the usage of this feature, what are the parts of the codebase that could be affected?
+Your thinking should be thorough and so it's fine if it's very long.
+ALWAYS output all your reasoning, be as detailed as possible.
+
+
+- Documentation has been taken into account, so you should not mention it in any way!
+- Testing has been taken into account, so you should not mention it in any way!
+- Be aware of consistency issues!
+- Provide ONLY the related functions. (e.g. if the <approach> mentions the write function, then generate the read function).
+
+
+EXAMPLE:
+<approach>
+The changes require changing how the data is stored.
+</approach>
+After implementing those changes:
+- The parser functions that read the data might need to be updated to adapt to the new format.
+"""
-This task is very complex, it requires careful planning and thinking.
-In order to properly complete the task, there are two phases:
-- Search: exploring the codebase, finding the relevant details. (e.g. what is the root cause of the issue?)
-- Summary: summarising the information you have gathered.
-- Code: implementing the solution. (e.g. how to fix the issue?)
+initial_prompt = """
-As a strategic manager, your goal is to create a suggested approach for phase %(phase)s.
+I am trying to fix the following issue:
-## Detailed Suggested Approaches
-Generate several detailed suggested approaches that will be used by your agents to complete the task.
-Each agent will be assigned one of the suggested approaches and will bring you back feedback.
-So, be creative and think of as many different approaches as possible.
-You are trying to HELP the agents complete the task, you MUST be AS DETAILED AS POSSIBLE.
+%(task)s
+
+Try to imagine with all details how you would fix the issue. What is the root cause of the issue?
+Consider opposite scenarios (e.g. if the issue involves writing to a file, consider what happens when the file is read).
+Consider edge cases (e.g. what if the file doesn't exist?).
+
+I've already taken care of all changes to any of the test files described in the issue. This means you DON'T have to think about the testing logic or any of the tests in any way!
+The idea is to make the minimal changes to non-test files in the /workspace directory to ensure the issue is resolved.
+
+How would you fix the issue described above with the least amount of steps? Generate the augmented issue description with the least amount of steps to fix the issue, enclosed in between <augmented_pr_description> and </augmented_pr_description> tags.
+Each step MUST be very detailed as to why it is needed.
+Your thinking should be thorough and so it's fine if it's very long.
+Be as detailed as possible.
+
+Documentation has been taken into account, so you should not repeat it in the <augmented_pr_description>.
+Testing has been taken into account, so you should not repeat it in the <augmented_pr_description>. You can create new tests, but never use existing tests.
+ALWAYS output all your reasoning, be as detailed as possible.
+
+Follow this structure:
+1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
+   - Files to explore, parts of the codebase I should focus on, keywords to look for...
+   - Extended reasoning...
+2. Create a script to reproduce the error and execute it to confirm that the error is reproducible
+   - Ensure that when executing the script, you get the error described in the issue
+   - Suggested code to reproduce the error, keeping in mind the side-effects described in the previous step, so that the error and side-effects are reproducible
+   - Extended reasoning...
+3. Edit the source code of the repo to resolve the issue
+   - Suggest what files to change and code SUGGESTIONS, trying to fix the issue with the least amount of changes.
+   - Keep in mind for the code suggestions that I might need to change some other functions to prevent the side-effects described in the previous steps.
+   - Extended reasoning...
+4. Rerun your reproduce script and confirm that the error is fixed!
+
+
+One step MUST be to recreate the issue and ensure that the error log is the same as the one described in the issue.
+
+
+Example:
+<augmented_pr_description>
+</augmented_pr_description>
+
+REMEMBER: you ARE ONLY suggesting steps to fix the issue, do NOT be assertive, use the language of a suggestion.
+"""
+right_track_prompt = """
-condense_information_prompt = """
-Previously, your agents were tasked to gather information about the codebase.
-They have now returned their findings.
+I am trying to fix an issue by following the steps described in the <augmented_pr_description>.
+I keep track of everything I did in the <approach> below.
-As a strategic manager, your job is to look CAREFULLY at the information they have gathered.
-You need to make sure you have a good understanding of the codebase, and the potential solutions
-to the task.
+<approach>
+%(approach)s
+</approach>
-## Information Gathered
-%(search_results)s
+Take a step back and reconsider everything I have done in the <approach>.
+Your thinking should be thorough and so it's fine if it's very long.
+Can you help me identify if I am on the right track?
-## Summary
-Do you think you have enough information to complete the task?
-If not, you need to request more information from the agents.
-Return a list of 1 JSON describing what extra information you would need and the suggested approach to gather that information.
-[
-    {
-        "suggested_approach": [""]
-    }
-]
-If you have enough information, you need to summarise the information you have gathered.
-How would you explain this to a new joiner to the team?
-Where would you point them to?
-Provide a detailed step by step guide.
-Remember, the agents DON'T have access to the internet. Every task must be conducted OFFLINE.
-The agents have cloned the repo, so they can open files, browse the code, interact with it...
-In the information gathered, there might be some repeated information, or some information
-that is actually not relevant.
-You need to be able to distinguish what is relevant, and what is not.
-In the information you have gathered, there might be file names, function names, class names. You MUST include
-them in the summary, so the agents know where to look.
-Generate a list of 1 JSON with the following format:
-[
-    {
-        "summary": [""]
-    }
-]
-
-IMPORTANT: Be VERY VERY VERY SPECIFIC.
-IMPORTANT: Include the file names, function names, class names, code blocks, in the step by step guide.
-IMPORTANT: Generate as many steps as possible.
+
+- If there are many code changes, I am probably not on the right track.
+- Only reply with yes or no, enclosed in between <answer> and </answer> tags
+"""
-# Constants for task type choices
-TASK_TYPE_ISSUE = 'yes, the task is an issue that needs to be replicated'
-TASK_TYPE_FEATURE = 'no, the task is a new feature that needs to be implemented'
+refactor_prompt = """
+The assistant is super CREATIVE and always thinks of different ways of approaching the problem.
-does_it_needs_a_test_prompt = (
-    """
-As a strategic manager, you need to judge if the task is an issue that needs to be replicated first
-or if it is a new feature that just needs to be implemented.
-
-Your agents have already gathered information about the codebase.
-
-## Information Gathered
-%(search_results)s
-
-Think CAREFULLY before answering.
-What do you think is the best course of action?
-IMPORTANT: You MUST return a list of 1 JSON with the following format:
-[
-    {
-        "suggested_approach": [""]
-    }
-]
+I am trying to fix an issue by following the steps described in the <augmented_pr_description>.
+I keep track of everything I did in the <approach> below.
-IMPORTANT: You MUST choose one of the two options.
+<approach>
+%(approach)s
+</approach>
+
+Take a step back and reconsider everything I have done in the <approach>.
+The idea is to make the minimal changes to non-test files in the /workspace directory to ensure the issue is resolved.
+I believe my approach is not the best one, so can you suggest what my IMMEDIATE next step should be? (You can suggest reverting the changes and trying something else)
+Your thinking should be thorough and so it's fine if it's very long.
+If possible, suggest ONLY code changes and the reasoning behind those changes.
+Do not use assertive language, use the language of a suggestion.
+REMEMBER: I might have written too many lines of code, so it might be better to discard those changes and start again.
+
+
+- Reply with the suggested approach enclosed in between <next_step> and </next_step> tags
+"""
-)
-initial_prompt = """
-You MUST ONLY generate a list of JSONs:
-
-[
-    {
-        "suggested_approach": [""]
-    },
-    {
-        "suggested_approach": [""]
-    },
-]
-
-Suggested approaches MUST be independent.
-You MUST generate at least 1 suggested approach.
-IMPORTANT: the agents DON'T have access to the internet. Every task must be conducted OFFLINE.
-The agents have cloned the repo, so they can open files, browse the code, interact with it...
-The goal of phase 1, exploring the codebase, finding the relevant details is ONLY to collect information.
-Be as HELPFUL and DETAILED as possible.
-Use the suggested approach to guide the agents in their exploration of the codebase.
-They MUST interact with the environment:
-- Open as many files as needed to gather as much information as possible.
-- Read every piece of code that might be relevant to the task, summarise what does it do.
-- Decide which functions are important to the task, understand how they are used and how they are called.
-
-Remember that the agents can use a Python environment with <execute_ipython>, e.g.:
-<execute_ipython>
-print("Hello World!")
-</execute_ipython>
-
-They can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
-If a bash command returns exit code `-1`, this means the process is not yet finished.
-They must then send a second <execute_bash>. The second <execute_bash> can be empty
-(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
-or it can contain the text `ctrl+c` to interrupt the process.
- -For commands that may run indefinitely, the output should be redirected to a file and the command run -in the background, e.g. python3 app.py > server.log 2>&1 & -If a command execution result says "Command timed out. Sending SIGINT to the process", -the assistant should retry running the command in the background. - -Be VERY VERY SPECIFIC. - ----- START OF EXAMPLE ---- - -## TASK - -" -Enable quiet mode/no-verbose in CLI for use in pre-commit hook There seems to be only an option to increase the level of verbosity when using -SQLFluff [CLI](https://docs.sqlfluff.com/en/stable/cli.html), not to limit it further. It would be great to have an option to further limit the amount of prints when running -`sqlfluff fix`, especially in combination with deployment using a pre-commit hook. For example, only print the return status and the number of fixes applied, similar to how it -is when using `black` in a pre-commit hook: ![image](https://user-images.githubusercontent.com/10177212/140480676-dc98d00b-4383-44f2-bb90-3301a6eedec2.png) This hides the potentially -long list of fixes that are being applied to the SQL files, which can get quite verbose. -" - -## YOUR RESPONSE: - -[ - { - "suggested_approach": [ - "1. Open the SQLFluff codebase and navigate to the CLI module, likely located in 'src/sqlfluff/cli/'.", - "2. Locate the file responsible for parsing command-line arguments, such as 'commands.py' or 'cli.py'.", - "3. Examine how the '--verbose' flag is implemented in the code.", - "4. Identify if there is an existing '--quiet' or '--no-verbose' option.", - "5. Understand how verbosity levels are set and managed within the CLI code.", - "6. Look for any variables or settings that control the default verbosity level.", - "7. Determine how the '--verbose' flag increases verbosity and see if a similar mechanism can decrease verbosity.", - "8. Note down any functions or methods that output information to the console.", - "9. Identify how these functions can be controlled via verbosity levels.", - "10. Summarize findings and consider how to implement a '--quiet' flag." - ] - }, - { - "suggested_approach": [ - "1. Investigate the logging configuration in SQLFluff, possibly located in 'src/sqlfluff/core/logger.py' or similar.", - "2. Understand how logging levels are set (e.g., DEBUG, INFO, WARNING, ERROR).", - "3. Examine if the logging levels are affected by CLI arguments.", - "4. Identify where in the code the logging configuration is initialized based on user input.", - "5. Check if there is a way to adjust the logging level via a CLI option.", - "6. Determine if adding a '--quiet' flag can set the logging level to WARNING or ERROR to suppress INFO messages.", - "7. Note the changes needed in the logging setup to support a quiet mode.", - "8. Identify all logging statements that may need to respect the new logging level.", - "9. Consider the impact on existing functionality and ensure that critical messages are still displayed.", - "10. Summarize how logging can be adjusted to implement a quiet mode." - ] - }, - { - "suggested_approach": [ - "1. Analyze how output to the console is handled throughout the codebase.", - "2. Identify the functions used for outputting messages, such as 'click.echo', 'print', or custom wrapper functions.", - "3. Trace where these output functions are called in the code, especially during 'sqlfluff fix' execution.", - "4. Determine if there is a centralized output function or if output is scattered across multiple functions.", - "5. 
Assess whether output functions can be modified to check a verbosity level before printing.", - "6. Consider creating or modifying a wrapper function that respects a verbosity or quiet setting.", - "7. Identify any messages that should always be displayed, regardless of verbosity settings (e.g., errors).", - "8. Note the locations in the code where changes need to be made to control output.", - "9. Evaluate the feasibility of implementing a quiet mode by adjusting output functions.", - "10. Summarize the steps required to control output at the source." - ] - }, - { - "suggested_approach": [ - "1. Explore the configuration options available in SQLFluff by examining the configuration parser code, possibly in 'src/sqlfluff/core/config.py'.", - "2. Look for existing configuration parameters related to verbosity or output control.", - "3. Determine how configuration files (like '.sqlfluff') are parsed and applied.", - "4. Assess if a new configuration option can be introduced to control verbosity levels.", - "5. Identify how this configuration option can be read and applied during runtime.", - "6. Check if the CLI options can override configuration file settings for verbosity.", - "7. Map out the code changes required to implement and support a new configuration option.", - "8. Ensure that the new configuration integrates smoothly with existing settings.", - "9. Consider user documentation and how users would be informed about the new option.", - "10. Summarize the process of adding a verbosity control via configuration files." - ] - }, - { - "suggested_approach": [ - "1. Examine the implementation of the 'sqlfluff fix' command to understand its workflow.", - "2. Identify where the command generates output and how that output is formatted.", - "3. Determine if 'sqlfluff fix' has different output modes or formats based on context.", - "4. Check if the command detects when it's running in a pre-commit hook or similar environment.", - "5. Consider if output suppression can be contextually applied when running in certain environments.", - "6. Identify any existing mechanisms for output control based on execution context.", - "7. Explore how the 'black' formatter handles output suppression in pre-commit hooks.", - "8. Analyze if similar techniques can be applied within SQLFluff's codebase.", - "9. Note any dependencies or external factors that influence output generation.", - "10. Summarize how context-aware output control can be implemented." - ] - } -] - - ----- END OF EXAMPLE ---- - - ---- START OF EXAMPLE 2 --- - -## TASK -" -ModelChain.prepare_inputs can succeed with missing dhi From the docstring for `ModelChain.prepare_inputs()` -I believe the method should fail if `weather` does not have a `dhi` column. The validation checks for `'ghi'` twice, -but not `'dhi`' https://github.com/pvlib/pvlib-python/blob/11c356f9a89fc88b4d3ff368ce1aae170a97ebd7/pvlib/modelchain.py#L1136 -" - -## YOUR RESPONSE: - -[ - { - "suggested_approach": [ - "1. Open the file pvlib/modelchain.py and locate the ModelChain.prepare_inputs method. Carefully read through the method's code, focusing on the section where it validates the weather DataFrame columns, specifically around line 1136.", - "2. Identify the validation checks for the weather DataFrame. Note whether it checks for the presence of 'dhi' or mistakenly checks for 'ghi' twice.", - "3. Examine the docstring of ModelChain.prepare_inputs to understand the expected behavior when dhi is missing from the weather data.", - "4. 
Investigate any helper functions called within prepare_inputs that handle irradiance data, such as methods for inferring missing components.", - "5. Review the unit tests related to prepare_inputs in pvlib/tests/test_modelchain.py to see if cases with missing dhi are covered.", - "6. Use the Python environment to simulate calling prepare_inputs with weather data missing the dhi column and observe the outcome.", - "", - "import pvlib", - "from pvlib import modelchain, location, pvsystem", - "import pandas as pd", - "mc = modelchain.ModelChain(pvsystem.PVSystem(), location.Location(32.2, -110.9))", - "weather = pd.DataFrame({'ghi': [1000], 'dni': [800]})", - "mc.prepare_inputs(weather)", - "", - "7. Document any discrepancies between the code and the documentation, and note any unexpected behaviors." - ] - }, - { - "suggested_approach": [ - "1. Generate a flowchart of the prepare_inputs method to understand its logic and how it processes the weather DataFrame.", - "2. Open pvlib/modelchain.py and trace each step within prepare_inputs, paying attention to how it handles missing data.", - "3. Look for any conditional statements that manage cases where dhi is not provided and see if alternative calculations are performed or if an error is raised.", - "4. Explore related methods like complete_irradiance or irradiance.get_total_irradiance to see how missing components are handled.", - "5. Test different weather DataFrame scenarios in the Python environment to observe how prepare_inputs behaves with various missing columns.", - "", - "import pvlib", - "from pvlib import modelchain, location, pvsystem", - "import pandas as pd", - "mc = modelchain.ModelChain(pvsystem.PVSystem(), location.Location(32.2, -110.9))", - "# Weather data missing 'dhi'", - "weather_missing_dhi = pd.DataFrame({'ghi': [1000], 'dni': [800]})", - "mc.prepare_inputs(weather_missing_dhi)", - "# Weather data missing 'ghi'", - "weather_missing_ghi = pd.DataFrame({'dhi': [200], 'dni': [800]})", - "mc.prepare_inputs(weather_missing_ghi)", - "", - "6. Record the outcomes and any exceptions raised to determine if the method behaves as intended." - ] - }, - { - "suggested_approach": [ - "1. Analyze the git commit history for modelchain.py to identify when the validation issue was introduced.", - "", - "cd pvlib-python", - "git log -L 1136,1140 /modelchain.py", - "", - "2. Review the changes in each commit affecting the validation checks in prepare_inputs.", - "3. Open the relevant commits and examine the differences in the validation code.", - "4. Check for any related issues or pull requests in the repository's local clone that discuss missing dhi validation.", - "5. Look into the test coverage reports (if available locally) to see if the validation logic is adequately tested.", - "6. Summarize findings on whether the issue is a recent regression or an existing oversight." - ] - } -] - ---- END OF EXAMPLE 2 --- - ---- YOUR TURN --- - -## TASK -%(task)s +critical_prompt = """ +The assistant is super CREATIVE, it considers every possible scenario that is DIFFERENT from the ones described in the . + +I believe I have fixed the issue described in the following the steps described in the + +%(approach)s + -## YOUR RESPONSE: +After fixing the issue, there might be some side-effects that we need to consider. +(e.g. if we fix the way data is written, then we might need to modify the way data is read) +Your thinking should be thorough and so it's fine if it's very long. 
+
+
+- Only reply with ONE side-effect, enclosed in between <side_effect> and </side_effect> tags and starting with the phrase "Have you considered..."
+- If you think everything is covered, just reply with "everything is covered" enclosed in between <side_effect> and </side_effect> tags
+"""
-def get_prompt(task: str, phase: str, search_results: str = '') -> str:
-    if phase == 'search':
-        base_prompt = general_description + initial_prompt
-    elif phase == 'summary':
-        base_prompt = general_description + condense_information_prompt
+def format_conversation(trajectory: list[Message]) -> str:
+    """Format a conversation history into a readable string.
+
+    Args:
+        trajectory: List of Message objects containing conversation turns
-    formatted_prompt = base_prompt % {
+    Returns:
+        Formatted string representing the conversation
+    """
+    formatted_parts = []
+
+    for message in trajectory:
+        role = message.role
+        # Join all TextContent messages together
+        content_text = ' '.join(
+            item.text for item in message.content if isinstance(item, TextContent)
+        )
+
+        if content_text.strip():  # Only add non-empty content
+            formatted_parts.append(f'{role}: {content_text}\n')
+
+    return '\n'.join(formatted_parts)
+
+
+def get_prompt(
+    task: str,
+    trajectory: list[Message],
+    prompt_type: str = 'initial',
+    augmented_task: str = '',
+) -> str:
+    """Format and return the appropriate prompt based on prompt_type.
+
+    Args:
+        task: The task description
+        trajectory: List of Message objects containing conversation history
+        prompt_type: Type of prompt to return ("initial", "right_track", "refactor", or "critical")
+        augmented_task: The augmented task description
+    Returns:
+        Formatted prompt string
+    """
+    # If approach is a conversation history, format it
+    if trajectory:
+        approach = format_conversation(trajectory)
+    else:
+        approach = ''
+
+    # Select the appropriate prompt template
+    if prompt_type == 'initial':
+        template = initial_prompt
+    elif prompt_type == 'right_track':
+        template = right_track_prompt
+    elif prompt_type == 'refactor':
+        template = refactor_prompt
+    elif prompt_type == 'critical':
+        template = critical_prompt
+    else:
+        raise ValueError(f'Unknown prompt type: {prompt_type}')
+
+    # Format the selected template with the task and approach
+    formatted_prompt = general_description + template % {
         'task': task,
-        'phase': phase,
-        'search_results': search_results,
+        'approach': approach,
+        'augmented_pr_description': augmented_task,
     }
-    # Add instruction to not include json formatting
-    formatted_prompt += '\n\nIMPORTANT: Do not include ```json at the start or ``` at the end of your response. Just return the raw JSON list.'
-    return formatted_prompt
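For reference, a minimal usage sketch of the new prompt helpers. This is a hypothetical illustration, not part of the patch: it assumes the module paths introduced in this diff are importable, the task and trajectory text are made up, and the Message/TextContent construction mirrors SupervisorAgent.get_response.

from openhands.agenthub.supervisor_agent.prompt import format_conversation, get_prompt
from openhands.core.message import Message, TextContent

# A tiny, made-up delegate trajectory; in practice the SupervisorAgent rebuilds
# this from the serialized 'trayectory' returned by the CodeActAgent delegate.
trajectory = [
    Message(role='user', content=[TextContent(text='Fix the off-by-one error in the pagination helper.')]),
    Message(role='assistant', content=[TextContent(text='I opened pagination.py and adjusted the slice bounds.')]),
]

# format_conversation flattens the Message objects into "role: text" blocks.
print(format_conversation(trajectory))

# get_prompt prepends general_description and fills the selected template;
# 'right_track' asks the LLM whether the recorded approach is still on track.
prompt = get_prompt(
    task='Fix the off-by-one error in the pagination helper.',
    trajectory=trajectory,
    prompt_type='right_track',
)
print(prompt)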