
Commit

Merge branch 'main' of github.com:replayio/SWE-agent into dominik/pro-864-6a-test-initial-runtime-data-hypotheses

Domiii committed Oct 7, 2024
2 parents 53785cc + 8d8906d commit f3788c3
Showing 9 changed files with 201 additions and 25 deletions.
6 changes: 3 additions & 3 deletions config/_tdd_repro_prompt.md
@@ -1,7 +1,7 @@
# Requirements

* You are provided `tdd_*` tools to reproduce the issue with one or more golden tests and also check for regressions.
* The output of the last reproduction is always provided to you.
* Failed tests also provide a `CALL_GRAPH_ON_EXCEPTION` containing the entire call hierarchy of all functions from that test.
* Don't submit until the reproduction command proves your fix.

* IMPORTANT: Always FIRST RUN the `tdd_repro` command to reproduce the issue.
* This provides you with in-depth test failure and runtime information.
* This includes `CALL_GRAPH_ON_EXCEPTION`: It contains the entire call hierarchy of all functions from that test.
6 changes: 6 additions & 0 deletions config/commands/_tdd.sh
@@ -28,6 +28,12 @@ tdd_repro() {
export TDD_TRACE_TARGET_CONFIG="{ \"target_file\": \"$1\", \"target_function_name\": \"$2\", \"decl_lineno\": $line_no}"
fi
eval "$TEST_CMD_FAIL_TO_PASS"

# include the continuation file if it exists
if [ -f "$MANUAL_INPUT_CONTINUATION_FILE" ]; then
cat $MANUAL_INPUT_CONTINUATION_FILE
rm $MANUAL_INPUT_CONTINUATION_FILE
fi
popd > /dev/null
}

2 changes: 0 additions & 2 deletions config/default_with_tools.yaml
@@ -14,7 +14,6 @@ instance_template: |-
{issue}
</ISSUE_DESCRIPTION>
{tdd_results}
# COMMAND/TOOL INSTRUCTIONS
* YOU CAN ONLY MAKE ONE TOOL CALL ("COMMAND") AT A TIME. You should always wait for feedback after every command.
* You can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
@@ -58,7 +57,6 @@ instance_template: |-
* Thoughts: <YOUR_THOUGHTS/>
</CALL_GRAPH_ANALYSIS_TEMPLATE>
(Open file: {open_file})
(Current directory: {working_dir})
bash-$
23 changes: 20 additions & 3 deletions run_instance.sh
@@ -1,8 +1,24 @@
set -euo pipefail

case $# in
1)
MANUAL_INPUT_ARGS=""
;;

3)
MANUAL_INPUT_ARGS="--manual_input_conversation_path $2 --manual_input_continuation_label $3"
;;

*)
echo "Usage: $0 <instance_id> [<manual_input_conversation_path> <manual_input_continuation_label>]"
exit 1
;;
esac

set -x

# This runs the instance from the official SWE-agent demo video.
# See: https://www.youtube.com/watch?v=CeMtJ4XObAM
echo $MANUAL_INPUT_ARGS

python3 run.py \
--model_name "claude-sonnet-3.5" \
--data_path "princeton-nlp/SWE-bench_Verified" \
@@ -12,5 +28,6 @@ python3 run.py \
--instance_filter "$1" \
--skip_existing False \
--cache_task_images \
--tdd
--tdd \
$MANUAL_INPUT_ARGS

68 changes: 53 additions & 15 deletions sweagent/agent/agents.py
@@ -3,6 +3,7 @@
import json
import re
from dataclasses import dataclass
from operator import itemgetter
from pathlib import Path
from typing import Any, TypedDict

@@ -13,6 +14,7 @@

from sweagent.agent.commands import Command, ParseCommand
from sweagent.agent.history_processors import HistoryProcessor
from sweagent.agent.manual_input import ManualInput
from sweagent.agent.models import (
APIStats,
ContextWindowExceededError,
@@ -21,6 +23,7 @@
ModelQueryResult,
get_last_valid_tool_use_name,
get_model,
make_model_query_result,
make_assistant_content,
make_user_reply_content,
)
@@ -194,6 +197,9 @@ class AgentArguments(FlattenedAccess, FrozenSerializable):
# We put tdd on the agent args because it needs it in post_init.
tdd: bool = False

manual_input_conversation_path: Path | None = None
manual_input_continuation_label: str | None = None

def __post_init__(self):
if self.config is None and self.config_file is not None:
# If unassigned, we load the config from the file to store its contents with the overall arguments
@@ -284,6 +290,8 @@ def __init__(self, name: str, args: AgentArguments, env: SWEEnv):
self.last_container_id = None
self.hooks = []
self.logger = get_logger(f"Agent[{name}]")
self.manual_input = ManualInput(args.manual_input_conversation_path, args.manual_input_continuation_label)
self.initial_model_response = None

def add_hook(self, hook: AgentHook):
"""Add hook to agent"""
@@ -318,22 +326,12 @@ def _system_repro_prompt(self):

# Unflag all tdd entries from history, so it can be compressed.
def _unflag_tdd_history(self):
self._assert_tdd_history_entries()
# self._assert_tdd_history_entries()

for entry in self.history:
if "tdd" in entry:
entry["tdd"] = False

def _make_initial_tdd_result(self) -> str:
if self.env.tdd:
if "{tdd_results}" not in self.config.instance_template:
# {tdd_results} needs to be referenced in instance_template for this to work.
raise ValueError("{tdd_results} not found in instance_template:\n\n" + self.config.instance_template)
test_result = self.env.communicate("tdd_repro")
# logger.debug(f"[TDD] Initial Results:\n{test_result}")
return f"# ISSUE REPRODUCTION RESULTS<NOTE: These are the results of a test run of tests that reproduce the issue. It contains call graphs of failed tests. Start your investigation here./>\n<ISSUE_REPRODUCTION>\n{test_result}\n</ISSUE_REPRODUCTION>\n\n"
return ""

# ###########################################################################
# setup and more
# ###########################################################################
@@ -349,6 +347,29 @@ def setup(self, env: SWEEnv, instance_args, init_model_stats=None) -> None:
self.model.setup(init_model_stats)
self.instance_args = instance_args

if self.manual_input.enabled():
history_and_patch = self.manual_input.load_conversation()
if history_and_patch is not None:
history, patch = history_and_patch
self.initial_model_response = history[-1]
assert self.initial_model_response["role"] == "assistant"
assert self.initial_model_response["action"] == "tdd_repro"
self.history = history[:-1]
self.made_initial_prompt = True
if patch is not None:
env.apply_conversation_patch(patch)

continuation = self.manual_input.load_continuation_file()

if continuation is not None:
self.logger.info(f"Continuing from manual input:\n{continuation}")
continuation_file_path = "/root/continuation.md"
env.copy_string_to_container_file(continuation, continuation_file_path)
env.communicate_with_handling(f'export MANUAL_INPUT_CONTINUATION_FILE="{continuation_file_path}"')

return


# Compose system prompt.
system_msg = self.config.system_template.format(**self.system_args)
system_msg = f"{system_msg}\n\n{self._system_repro_prompt()}"
@@ -571,7 +592,13 @@ def forward(self, observation: str, available_actions: list[str], state: str) ->
action: action that the model proposes
output: raw model output (not output of the action)
"""
thought, action, output = self.forward_with_error_check(observation, state)
if self.initial_model_response is not None:
thought, action, content = itemgetter("thought", "action", "content")(self.initial_model_response)
output = make_model_query_result(content)
self.initial_model_response = None
else:
thought, action, output = self.forward_with_error_check(observation, state)

last_tool_name = get_last_valid_tool_use_name(output)
last_command = self.get_command(last_tool_name)
ran_tdd_action = last_command.tdd if last_command else False
@@ -619,8 +646,6 @@ def forward_model(self, observation: str, state: str) -> ModelQueryResult:
templates = [self.config.instance_template]
if self.config.strategy_template is not None:
templates.append(self.config.strategy_template)
# Get tdd_results, to be rendered into the initial_prompt template.
state_vars["tdd_results"] = self._make_initial_tdd_result()
elif observation is None or observation.strip() == "":
# Show no output template if observation content was empty
templates = [self.config.next_step_no_output_template]
@@ -896,13 +921,15 @@ def run(
If return_type is "info_trajectory", returns a tuple of
the info dictionary and the trajectory (list of dictionaries).
"""
self.made_initial_prompt = False
done = False
# mypy checks
assert env.container_obj is not None
assert env.record is not None
assert self.config is not None

self.made_initial_prompt = False
self.manual_input.set_instance_id(env.record["instance_id"])

if env.container_obj.id != self.last_container_id:
self.logger.info(f"Initializing agent settings for container {env.container_obj.id}")
self.init_environment_vars(env)
@@ -922,6 +949,7 @@
for hook in self.hooks:
hook.on_step_start()
state = env.communicate(self.state_command) if self.state_command else None
stop_on_tdd_repro = self.manual_input.enabled() and self.initial_model_response is None
thought, action, output = self.forward(observation, env.get_available_actions(), state)
for hook in self.hooks:
hook.on_actions_generated(thought=thought, action=action, output=repr(output))
@@ -937,6 +965,16 @@
observations.append(obs)
if sub_action["cmd_name"] == self.config.submit_command:
done = True
if sub_action["action"] == "tdd_repro" and stop_on_tdd_repro:
done = True
output = env.communicate("([ -s /root/test.patch ] && git apply -R < /root/test.patch); git add -A && echo -n '<<CONVERSATION_PATCH||' > /root/conversation-patch && git diff --cached >> /root/conversation-patch && echo -n '||CONVERSATION_PATCH>>' >> /root/conversation-patch && cat /root/conversation-patch")

pattern = r"\<\<CONVERSATION_PATCH\|\|(.*)\|\|CONVERSATION_PATCH\>\>"

match = re.search(pattern, output, re.DOTALL)
if match is not None:
patch = match.group(1)
self.manual_input.save_conversation(self.history, patch)
if done:
break
else:
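
For clarity: the long `env.communicate(...)` call in the run loop above reverse-applies the previously applied test patch (if present), stages all remaining edits, and wraps `git diff --cached` between `<<CONVERSATION_PATCH||` and `||CONVERSATION_PATCH>>` so the diff can be recovered from mixed shell output. A minimal sketch of just the extraction step, against a made-up `raw_output` string (illustrative only, not part of the commit):

```python
import re

# Made-up container output: shell noise plus the delimited diff emitted by
# the command in the run loop above.
raw_output = (
    "warning: unable to rmdir foo\n"
    "<<CONVERSATION_PATCH||diff --git a/demo.py b/demo.py\n"
    "+print('fix')\n"
    "||CONVERSATION_PATCH>>"
)

# Same pattern as in run(): re.DOTALL lets the captured diff span multiple lines.
pattern = r"\<\<CONVERSATION_PATCH\|\|(.*)\|\|CONVERSATION_PATCH\>\>"
match = re.search(pattern, raw_output, re.DOTALL)
patch = match.group(1) if match else None
print(patch)  # the bare diff, ready to hand to ManualInput.save_conversation()
```
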
100 changes: 100 additions & 0 deletions sweagent/agent/manual_input.py
@@ -0,0 +1,100 @@
from __future__ import annotations
import os
from typing import Tuple

from sweagent.utils.log import get_logger
from sweagent.agent.model_cache import json_serialize_file, json_deserialize_file, json_serialize_str, json_deserialize_str, hash_string

ManualTDDInputEnvVar = "MANUAL_TDD_INPUT_DIRECTORY"

logger = get_logger("manual_tdd_input")

class ManualInput:
base_dir: str | None
instance_id: str | None
conversation_path: str | None
continuation_label: str | None

def __init__(self, conversation_path: str | None, continuation_label: str | None):
self.conversation_path = conversation_path
self.continuation_label = continuation_label
self.base_dir = None
if ManualTDDInputEnvVar in os.environ:
logger.warning("⚠ ManualInput is enabled")
self.base_dir = os.environ[ManualTDDInputEnvVar]

def enabled(self) -> bool:
return self.base_dir is not None

def set_instance_id(self, instance_id: str) -> None:
self.instance_id = instance_id

def _get_conversation_dir(self) -> str:
if self.conversation_path is None:
return os.path.join(self.base_dir, self.instance_id)
return os.path.join(self.base_dir, self.instance_id, self.conversation_path)

def load_continuation_file(self) -> str | None:
if not self.enabled():
return None

try:
with open(os.path.join(self._get_conversation_dir(), f"{self.continuation_label}.md"), "r") as f:
return f.read()
except FileNotFoundError:
return None

def save_conversation(self, conversation: list[dict[str, str]], patch: str | None) -> None:
if not self.enabled():
return None

parent_dir = self._get_conversation_dir()

# if continuation_label is left off, we're storing the root conversation
if self.continuation_label is None:
self.continuation_label = "root"

content = json_serialize_str(conversation)
hash = hash_string(content)

new_subdir = os.path.join(parent_dir, f"{self.continuation_label}-{hash}")
os.makedirs(new_subdir, exist_ok=True)

with open(os.path.join(new_subdir, "conversation.json"), "w") as f:
f.write(content)

if patch is not None and patch.strip() != "":
with open(os.path.join(new_subdir, "patch.diff"), "w") as f:
f.write(patch)

subdir_to_print = new_subdir[len(self.base_dir):]
logger.info(f"Conversation saved to ${ManualTDDInputEnvVar}%s", subdir_to_print)

def load_conversation(self) -> Tuple[list[dict[str, str]], str] | None:
if not self.enabled():
return None

dir = self._get_conversation_dir()
if not os.path.exists(dir):
return None

conversation_file_path = os.path.join(dir, "conversation.json")
if not os.path.exists(conversation_file_path):
return None

with open(conversation_file_path, "r") as f:
conversation = json_deserialize_str(f.read())

patch_file_path = os.path.join(dir, "patch.diff")

# a missing patch isn't an error (will happen with the first tdd_repro call)
if os.path.exists(patch_file_path):
with open(patch_file_path, "r") as f:
patch = f.read()
else:
patch = None

dir_to_print = dir[len(self.base_dir):]
logger.info("Conversation loaded from ${ManualTDDInputEnvVar}%s", dir_to_print)

return conversation, patch
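
For orientation, a minimal usage sketch of the class above. The directory, instance id, and history entries are made-up examples; only the environment variable, constructor arguments, and method names come from this file. On disk, each save lands in `<base_dir>/<instance_id>[/<conversation_path>]/<label>-<hash>/` containing `conversation.json` and, optionally, `patch.diff`.

```python
import os
from sweagent.agent.manual_input import ManualInput

# Enable the feature by pointing the env var at a writable scratch directory
# (example path).
os.environ["MANUAL_TDD_INPUT_DIRECTORY"] = "/tmp/manual-tdd-input"

# First run: nothing saved yet, so both arguments are None.
mi = ManualInput(conversation_path=None, continuation_label=None)
mi.set_instance_id("astropy__astropy-12907")  # example instance id

history = [{"role": "assistant", "thought": "...", "action": "tdd_repro", "content": "..."}]
mi.save_conversation(history, patch="diff --git a/demo.py b/demo.py\n")
# writes <base_dir>/astropy__astropy-12907/root-<hash>/conversation.json (+ patch.diff)

# Later run: point conversation_path at the saved subdirectory (as logged above)
# and name the continuation file to look for, e.g. "try-other-fix.md".
mi2 = ManualInput(conversation_path="root-<hash>", continuation_label="try-other-fix")
mi2.set_instance_id("astropy__astropy-12907")
loaded = mi2.load_conversation()        # (history, patch) or None
note = mi2.load_continuation_file()     # contents of try-other-fix.md, or None
```
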
2 changes: 1 addition & 1 deletion sweagent/agent/model_cache.py
@@ -146,7 +146,7 @@ def __init__(self):

def _get_file(self, history: list[dict[str, str]]) -> str:
hash_input = json_serialize_str(history)
print(f"HASH_INPUT\n{hash_input}\nEND_OF_HASH_INPUT")
# print(f"HASH_INPUT\n{hash_input}\nEND_OF_HASH_INPUT")
hash = hash_string(hash_input)
return f"{self.directory}/model-query-{hash}.json"

7 changes: 6 additions & 1 deletion sweagent/agent/models.py
@@ -33,6 +33,11 @@
_MAX_RETRIES = keys_config.get("SWE_AGENT_MODEL_MAX_RETRIES", 0)


def make_model_query_result(content: str | list[ContentBlock]) -> ModelQueryResult:
if isinstance(content, str):
return content
return AnthropicModelResult(blocks=content)

def make_assistant_content(output: ModelQueryResult):
if isinstance(output, str):
return output
@@ -68,7 +73,7 @@ def make_user_reply_content(action_result: str, model_result: ModelQueryResult |
result = {
"type": "tool_result",
"tool_use_id": tool_use.id,
"content": action_result
"content": action_result,
}
if is_error:
result["is_error"] = True
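
A quick illustration of the `make_model_query_result` helper added above: plain-string content is already a valid `ModelQueryResult` and passes through unchanged, while a list of content blocks is wrapped in an `AnthropicModelResult`. This sketch assumes `AnthropicModelResult` is importable from `sweagent.agent.models` and accepts an empty block list; the values are placeholders.

```python
from sweagent.agent.models import AnthropicModelResult, make_model_query_result

# String content passes through untouched.
assert make_model_query_result("ls -la") == "ls -la"

# Content-block lists (e.g. Anthropic tool-use blocks) are wrapped so the rest
# of the agent can treat both shapes as a ModelQueryResult.
result = make_model_query_result([])  # empty placeholder block list
assert isinstance(result, AnthropicModelResult)
```
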
12 changes: 12 additions & 0 deletions sweagent/environment/swe_env.py
@@ -494,6 +494,18 @@ def _apply_test_patch(self):
)
self.logger.debug(f"[TDD] Applied test patch - output:\n{res}")

def apply_conversation_patch(self, patch: str) -> None:
"""
Apply patch to source in repo
"""
patch_path = "/root/conversation.patch"
self.copy_string_to_container_file(patch, patch_path)
res = self.communicate_with_handling(
f"cd /{self._repo_name} && git apply -v {patch_path} && rm {patch_path}",
error_msg="Failed to apply patch",
)
self.logger.debug(f"[TDD] Applied previous changes - output:\n{res}")

def step(self, action: str) -> tuple[str | None, int, bool, dict]:
"""
Runs an action proposed by the agent in the environment and returns the corresponding output.
