
Commit

Merge branch 'main' of github.com:replayio/SWE-agent into dominik/pro-864-6a-test-initial-runtime-data-hypotheses

Domiii committed Oct 7, 2024
2 parents 53785cc + 8d8906d commit f3788c3
Showing 9 changed files with 201 additions and 25 deletions.
6 changes: 3 additions & 3 deletions config/_tdd_repro_prompt.md
@@ -1,7 +1,7 @@
# Requirements

* You are provided `tdd_*` tools to reproduce the issue with one or more golden tests and also check for regressions.
* The output of the last reproduction is always provided to you.
* Failed tests also provide a `CALL_GRAPH_ON_EXCEPTION` containing the entire call hierarchy of all functions from that test.
* Don't submit until the reproduction command proves your fix.

* IMPORTANT: Always FIRST RUN the `tdd_repro` command to reproduce the issue.
* This provides you with in-depth test failure and runtime information.
* This includes `CALL_GRAPH_ON_EXCEPTION`: It contains the entire call hierarchy of all functions from that test.
6 changes: 6 additions & 0 deletions config/commands/_tdd.sh
@@ -28,6 +28,12 @@ tdd_repro() {
export TDD_TRACE_TARGET_CONFIG="{ \"target_file\": \"$1\", \"target_function_name\": \"$2\", \"decl_lineno\": $line_no}"
fi
eval "$TEST_CMD_FAIL_TO_PASS"

# include the continuation file if it exists
if [ -f "$MANUAL_INPUT_CONTINUATION_FILE" ]; then
cat $MANUAL_INPUT_CONTINUATION_FILE
rm $MANUAL_INPUT_CONTINUATION_FILE
fi
popd > /dev/null
}

2 changes: 0 additions & 2 deletions config/default_with_tools.yaml
@@ -14,7 +14,6 @@ instance_template: |-
{issue}
</ISSUE_DESCRIPTION>
{tdd_results}
# COMMAND/TOOL INSTRUCTIONS
* YOU CAN ONLY MAKE ONE TOOL CALL ("COMMAND") AT A TIME. You should always wait for feedback after every command.
* You can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
@@ -58,7 +57,6 @@ instance_template: |-
* Thoughts: <YOUR_THOUGHTS/>
</CALL_GRAPH_ANALYSIS_TEMPLATE>
(Open file: {open_file})
(Current directory: {working_dir})
bash-$
23 changes: 20 additions & 3 deletions run_instance.sh
@@ -1,8 +1,24 @@
set -euo pipefail

case $# in
1)
MANUAL_INPUT_ARGS=""
;;

3)
MANUAL_INPUT_ARGS="--manual_input_conversation_path $2 --manual_input_continuation_label $3"
;;

*)
echo "Usage: $0 <instance_id> [<manual_input_conversation_path> <manual_input_continuation_label>]"
exit 1
;;
esac

set -x

# This runs the instance from the official SWE-agent demo video.
# See: https://www.youtube.com/watch?v=CeMtJ4XObAM
echo $MANUAL_INPUT_ARGS

python3 run.py \
--model_name "claude-sonnet-3.5" \
--data_path "princeton-nlp/SWE-bench_Verified" \
@@ -12,5 +28,6 @@ python3 run.py \
--instance_filter "$1" \
--skip_existing False \
--cache_task_images \
--tdd
--tdd \
$MANUAL_INPUT_ARGS

68 changes: 53 additions & 15 deletions sweagent/agent/agents.py
@@ -3,6 +3,7 @@
import json
import re
from dataclasses import dataclass
from operator import itemgetter
from pathlib import Path
from typing import Any, TypedDict

@@ -13,6 +14,7 @@

from sweagent.agent.commands import Command, ParseCommand
from sweagent.agent.history_processors import HistoryProcessor
from sweagent.agent.manual_input import ManualInput
from sweagent.agent.models import (
APIStats,
ContextWindowExceededError,
@@ -21,6 +23,7 @@
ModelQueryResult,
get_last_valid_tool_use_name,
get_model,
make_model_query_result,
make_assistant_content,
make_user_reply_content,
)
@@ -194,6 +197,9 @@ class AgentArguments(FlattenedAccess, FrozenSerializable):
# We put tdd on the agent args because it needs it in post_init.
tdd: bool = False

manual_input_conversation_path: Path | None = None
manual_input_continuation_label: str | None = None

def __post_init__(self):
if self.config is None and self.config_file is not None:
# If unassigned, we load the config from the file to store its contents with the overall arguments
@@ -284,6 +290,8 @@ def __init__(self, name: str, args: AgentArguments, env: SWEEnv):
self.last_container_id = None
self.hooks = []
self.logger = get_logger(f"Agent[{name}]")
self.manual_input = ManualInput(args.manual_input_conversation_path, args.manual_input_continuation_label)
self.initial_model_response = None

def add_hook(self, hook: AgentHook):
"""Add hook to agent"""
@@ -318,22 +326,12 @@ def _system_repro_prompt(self):

# Unflag all tdd entries from history, so it can be compressed.
def _unflag_tdd_history(self):
self._assert_tdd_history_entries()
# self._assert_tdd_history_entries()

for entry in self.history:
if "tdd" in entry:
entry["tdd"] = False

def _make_initial_tdd_result(self) -> str:
if self.env.tdd:
if "{tdd_results}" not in self.config.instance_template:
# {tdd_results} needs to be referenced in instance_template for this to work.
raise ValueError("{tdd_results} not found in instance_template:\n\n" + self.config.instance_template)
test_result = self.env.communicate("tdd_repro")
# logger.debug(f"[TDD] Initial Results:\n{test_result}")
return f"# ISSUE REPRODUCTION RESULTS<NOTE: These are the results of a test run of tests that reproduce the issue. It contains call graphs of failed tests. Start your investigation here./>\n<ISSUE_REPRODUCTION>\n{test_result}\n</ISSUE_REPRODUCTION>\n\n"
return ""

# ###########################################################################
# setup and more
# ###########################################################################
@@ -349,6 +347,29 @@ def setup(self, env: SWEEnv, instance_args, init_model_stats=None) -> None:
self.model.setup(init_model_stats)
self.instance_args = instance_args

if self.manual_input.enabled():
history_and_patch = self.manual_input.load_conversation()
if history_and_patch is not None:
history, patch = history_and_patch
self.initial_model_response = history[-1]
assert self.initial_model_response["role"] == "assistant"
assert self.initial_model_response["action"] == "tdd_repro"
self.history = history[:-1]
self.made_initial_prompt = True
if patch is not None:
env.apply_conversation_patch(patch)

continuation = self.manual_input.load_continuation_file()

if continuation is not None:
self.logger.info(f"Continuing from manual input:\n{continuation}")
continuation_file_path = "/root/continuation.md"
env.copy_string_to_container_file(continuation, continuation_file_path)
env.communicate_with_handling(f'export MANUAL_INPUT_CONTINUATION_FILE="{continuation_file_path}"')

return


# Compose system prompt.
system_msg = self.config.system_template.format(**self.system_args)
system_msg = f"{system_msg}\n\n{self._system_repro_prompt()}"
@@ -571,7 +592,13 @@ def forward(self, observation: str, available_actions: list[str], state: str) ->
action: action that the model proposes
output: raw model output (not output of the action)
"""
thought, action, output = self.forward_with_error_check(observation, state)
if self.initial_model_response is not None:
thought, action, content = itemgetter("thought", "action", "content")(self.initial_model_response)
output = make_model_query_result(content)
self.initial_model_response = None
else:
thought, action, output = self.forward_with_error_check(observation, state)

last_tool_name = get_last_valid_tool_use_name(output)
last_command = self.get_command(last_tool_name)
ran_tdd_action = last_command.tdd if last_command else False
@@ -619,8 +646,6 @@ def forward_model(self, observation: str, state: str) -> ModelQueryResult:
templates = [self.config.instance_template]
if self.config.strategy_template is not None:
templates.append(self.config.strategy_template)
# Get tdd_results, to be rendered into the initial_prompt template.
state_vars["tdd_results"] = self._make_initial_tdd_result()
elif observation is None or observation.strip() == "":
# Show no output template if observation content was empty
templates = [self.config.next_step_no_output_template]
@@ -896,13 +921,15 @@ def run(
If return_type is "info_trajectory", returns a tuple of
the info dictionary and the trajectory (list of dictionaries).
"""
self.made_initial_prompt = False
done = False
# mypy checks
assert env.container_obj is not None
assert env.record is not None
assert self.config is not None

self.made_initial_prompt = False
self.manual_input.set_instance_id(env.record["instance_id"])

if env.container_obj.id != self.last_container_id:
self.logger.info(f"Initializing agent settings for container {env.container_obj.id}")
self.init_environment_vars(env)
@@ -922,6 +949,7 @@
for hook in self.hooks:
hook.on_step_start()
state = env.communicate(self.state_command) if self.state_command else None
stop_on_tdd_repro = self.manual_input.enabled() and self.initial_model_response is None
thought, action, output = self.forward(observation, env.get_available_actions(), state)
for hook in self.hooks:
hook.on_actions_generated(thought=thought, action=action, output=repr(output))
@@ -937,6 +965,16 @@
observations.append(obs)
if sub_action["cmd_name"] == self.config.submit_command:
done = True
if sub_action["action"] == "tdd_repro" and stop_on_tdd_repro:
done = True
output = env.communicate("([ -s /root/test.patch ] && git apply -R < /root/test.patch); git add -A && echo -n '<<CONVERSATION_PATCH||' > /root/conversation-patch && git diff --cached >> /root/conversation-patch && echo -n '||CONVERSATION_PATCH>>' >> /root/conversation-patch && cat /root/conversation-patch")

pattern = r"\<\<CONVERSATION_PATCH\|\|(.*)\|\|CONVERSATION_PATCH\>\>"

match = re.search(pattern, output, re.DOTALL)
if match is not None:
patch = match.group(1)
self.manual_input.save_conversation(self.history, patch)
if done:
break
else:
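
For clarity: the long `env.communicate(...)` call in the run loop above reverse-applies the previously applied test patch (if present), stages all remaining edits, and wraps `git diff --cached` between `<<CONVERSATION_PATCH||` and `||CONVERSATION_PATCH>>` so the diff can be recovered from mixed shell output. A minimal sketch of just the extraction step, against a made-up `raw_output` string (illustrative only, not part of the commit):

```python
import re

# Made-up container output: shell noise plus the delimited diff emitted by
# the command in the run loop above.
raw_output = (
    "warning: unable to rmdir foo\n"
    "<<CONVERSATION_PATCH||diff --git a/demo.py b/demo.py\n"
    "+print('fix')\n"
    "||CONVERSATION_PATCH>>"
)

# Same pattern as in run(): re.DOTALL lets the captured diff span multiple lines.
pattern = r"\<\<CONVERSATION_PATCH\|\|(.*)\|\|CONVERSATION_PATCH\>\>"
match = re.search(pattern, raw_output, re.DOTALL)
patch = match.group(1) if match else None
print(patch)  # the bare diff, ready to hand to ManualInput.save_conversation()
```
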
100 changes: 100 additions & 0 deletions sweagent/agent/manual_input.py
@@ -0,0 +1,100 @@
from __future__ import annotations
import os
from typing import Tuple

from sweagent.utils.log import get_logger
from sweagent.agent.model_cache import json_serialize_file, json_deserialize_file, json_serialize_str, json_deserialize_str, hash_string

ManualTDDInputEnvVar = "MANUAL_TDD_INPUT_DIRECTORY"

logger = get_logger("manual_tdd_input")

class ManualInput:
base_dir: str | None
instance_id: str | None
conversation_path: str | None
continuation_label: str | None

def __init__(self, conversation_path: str | None, continuation_label: str | None):
self.conversation_path = conversation_path
self.continuation_label = continuation_label
self.base_dir = None
if ManualTDDInputEnvVar in os.environ:
logger.warning("⚠ ManualInput is enabled")
self.base_dir = os.environ[ManualTDDInputEnvVar]

def enabled(self) -> bool:
return self.base_dir is not None

def set_instance_id(self, instance_id: str) -> None:
self.instance_id = instance_id

def _get_conversation_dir(self) -> str:
if self.conversation_path is None:
return os.path.join(self.base_dir, self.instance_id)
return os.path.join(self.base_dir, self.instance_id, self.conversation_path)

def load_continuation_file(self) -> str | None:
if not self.enabled():
return None

try:
with open(os.path.join(self._get_conversation_dir(), f"{self.continuation_label}.md"), "r") as f:
return f.read()
except FileNotFoundError:
return None

def save_conversation(self, conversation: list[dict[str, str]], patch: str | None) -> None:
if not self.enabled():
return None

parent_dir = self._get_conversation_dir()

# if continuation_label is left off, we're storing the root conversation
if self.continuation_label is None:
self.continuation_label = "root"

content = json_serialize_str(conversation)
hash = hash_string(content)

new_subdir = os.path.join(parent_dir, f"{self.continuation_label}-{hash}")
os.makedirs(new_subdir, exist_ok=True)

with open(os.path.join(new_subdir, "conversation.json"), "w") as f:
f.write(content)

if patch is not None and patch.strip() != "":
with open(os.path.join(new_subdir, "patch.diff"), "w") as f:
f.write(patch)

subdir_to_print = new_subdir[len(self.base_dir):]
logger.info(f"Conversation saved to ${ManualTDDInputEnvVar}%s", subdir_to_print)

def load_conversation(self) -> Tuple[list[dict[str, str]], str] | None:
if not self.enabled():
return None

dir = self._get_conversation_dir()
if not os.path.exists(dir):
return None

conversation_file_path = os.path.join(dir, "conversation.json")
if not os.path.exists(conversation_file_path):
return None

with open(conversation_file_path, "r") as f:
conversation = json_deserialize_str(f.read())

patch_file_path = os.path.join(dir, "patch.diff")

# a missing patch isn't an error (will happen with the first tdd_repro call)
if os.path.exists(patch_file_path):
with open(patch_file_path, "r") as f:
patch = f.read()
else:
patch = None

dir_to_print = dir[len(self.base_dir):]
logger.info("Conversation loaded from ${ManualTDDInputEnvVar}%s", dir_to_print)

return conversation, patch
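
For orientation, a minimal usage sketch of the class above. The directory, instance id, and history entries are made-up examples; only the environment variable, constructor arguments, and method names come from this file. On disk, each save lands in `<base_dir>/<instance_id>[/<conversation_path>]/<label>-<hash>/` containing `conversation.json` and, optionally, `patch.diff`.

```python
import os
from sweagent.agent.manual_input import ManualInput

# Enable the feature by pointing the env var at a writable scratch directory
# (example path).
os.environ["MANUAL_TDD_INPUT_DIRECTORY"] = "/tmp/manual-tdd-input"

# First run: nothing saved yet, so both arguments are None.
mi = ManualInput(conversation_path=None, continuation_label=None)
mi.set_instance_id("astropy__astropy-12907")  # example instance id

history = [{"role": "assistant", "thought": "...", "action": "tdd_repro", "content": "..."}]
mi.save_conversation(history, patch="diff --git a/demo.py b/demo.py\n")
# writes <base_dir>/astropy__astropy-12907/root-<hash>/conversation.json (+ patch.diff)

# Later run: point conversation_path at the saved subdirectory (as logged above)
# and name the continuation file to look for, e.g. "try-other-fix.md".
mi2 = ManualInput(conversation_path="root-<hash>", continuation_label="try-other-fix")
mi2.set_instance_id("astropy__astropy-12907")
loaded = mi2.load_conversation()        # (history, patch) or None
note = mi2.load_continuation_file()     # contents of try-other-fix.md, or None
```
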
2 changes: 1 addition & 1 deletion sweagent/agent/model_cache.py
@@ -146,7 +146,7 @@ def __init__(self):

def _get_file(self, history: list[dict[str, str]]) -> str:
hash_input = json_serialize_str(history)
print(f"HASH_INPUT\n{hash_input}\nEND_OF_HASH_INPUT")
# print(f"HASH_INPUT\n{hash_input}\nEND_OF_HASH_INPUT")
hash = hash_string(hash_input)
return f"{self.directory}/model-query-{hash}.json"

7 changes: 6 additions & 1 deletion sweagent/agent/models.py
@@ -33,6 +33,11 @@
_MAX_RETRIES = keys_config.get("SWE_AGENT_MODEL_MAX_RETRIES", 0)


def make_model_query_result(content: str | list[ContentBlock]) -> ModelQueryResult:
if isinstance(content, str):
return content
return AnthropicModelResult(blocks=content)

def make_assistant_content(output: ModelQueryResult):
if isinstance(output, str):
return output
@@ -68,7 +73,7 @@ def make_user_reply_content(action_result: str, model_result: ModelQueryResult |
result = {
"type": "tool_result",
"tool_use_id": tool_use.id,
"content": action_result
"content": action_result,
}
if is_error:
result["is_error"] = True
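
A quick illustration of the `make_model_query_result` helper added above: plain-string content is already a valid `ModelQueryResult` and passes through unchanged, while a list of content blocks is wrapped in an `AnthropicModelResult`. This sketch assumes `AnthropicModelResult` is importable from `sweagent.agent.models` and accepts an empty block list; the values are placeholders.

```python
from sweagent.agent.models import AnthropicModelResult, make_model_query_result

# String content passes through untouched.
assert make_model_query_result("ls -la") == "ls -la"

# Content-block lists (e.g. Anthropic tool-use blocks) are wrapped so the rest
# of the agent can treat both shapes as a ModelQueryResult.
result = make_model_query_result([])  # empty placeholder block list
assert isinstance(result, AnthropicModelResult)
```
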
12 changes: 12 additions & 0 deletions sweagent/environment/swe_env.py
@@ -494,6 +494,18 @@ def _apply_test_patch(self):
)
self.logger.debug(f"[TDD] Applied test patch - output:\n{res}")

def apply_conversation_patch(self, patch: str) -> None:
"""
Apply patch to source in repo
"""
patch_path = "/root/conversation.patch"
self.copy_string_to_container_file(patch, patch_path)
res = self.communicate_with_handling(
f"cd /{self._repo_name} && git apply -v {patch_path} && rm {patch_path}",
error_msg="Failed to apply patch",
)
self.logger.debug(f"[TDD] Applied previous changes - output:\n{res}")

def step(self, action: str) -> tuple[str | None, int, bool, dict]:
"""
Runs an action proposed by the agent in the environment and returns the corresponding output.
