Skip to content

Commit

Permalink
refactor: Replace pexpect with libtmux in BashSession (#4881)
Browse files Browse the repository at this point in the history
Co-authored-by: openhands <[email protected]>
Co-authored-by: Engel Nyst <[email protected]>
Co-authored-by: Robert Brennan <[email protected]>
  • Loading branch information
4 people authored Jan 3, 2025
1 parent 761a574 commit ec70af9
Show file tree
Hide file tree
Showing 66 changed files with 2,343 additions and 758 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/dummy-agent-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ jobs:
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Install tmux
run: sudo apt-get update && sudo apt-get install -y tmux
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/eval-runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install tmux
run: sudo apt-get update && sudo apt-get install -y tmux
- name: Install poetry via pipx
run: pipx install poetry

Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/py-unit-tests-mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ jobs:
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Install tmux
run: brew install tmux
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/py-unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Install tmux
run: sudo apt-get update && sudo apt-get install -y tmux
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
Expand Down
1 change: 0 additions & 1 deletion docs/static/img/backend_architecture.puml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ class openhands.state.State {
updated_info: List[Tuple[Action, Observation]]
}
class openhands.observation.CmdOutputObservation {
command_id: int
command: str
exit_code: int
observation: str
Expand Down
4 changes: 1 addition & 3 deletions evaluation/benchmarks/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ def complete_runtime(

action = CmdRunAction(
command=f'chmod +x ./{script_name} && ./{script_name}',
keep_prompt=False,
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
Expand All @@ -164,8 +163,7 @@ def complete_runtime(
logger.info(f'Running get ground truth cmd: {script_name}')

action = CmdRunAction(
command=f'chmod +x ./{script_name} && ./{script_name}',
keep_prompt=False,
command=f'chmod +x ./{script_name} && ./{script_name}'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
Expand Down
5 changes: 1 addition & 4 deletions evaluation/benchmarks/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,7 @@ def complete_runtime(
)
logger.info(f'Running test file: {script_name}')

action = CmdRunAction(
command=f'python3 -m unittest {script_name}',
keep_prompt=False,
)
action = CmdRunAction(command=f'python3 -m unittest {script_name}')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
6 changes: 2 additions & 4 deletions evaluation/benchmarks/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def complete_runtime(
if obs.exit_code == 0:
test_result['metadata']['1_copy_change_success'] = True

action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
action = CmdRunAction(command=f'cat {generated_path}')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0
Expand All @@ -223,9 +223,7 @@ def complete_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(
command='cat /testing_files/results_biocoder.json', keep_prompt=False
)
action = CmdRunAction(command='cat /testing_files/results_biocoder.json')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
if obs.exit_code == 0:
Expand Down
1 change: 0 additions & 1 deletion evaluation/benchmarks/bird/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
"observation": "run",
"content": "california_schools/california_schools.sqlite\r\n[(1.0,)]",
"extras": {
"command_id": -1,
"command": "python3 0.py",
"exit_code": 0
}
Expand Down
10 changes: 2 additions & 8 deletions evaluation/benchmarks/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,7 @@ def initialize_runtime(
runtime.copy_to(db_file, '/workspace')

# Check the database is copied
action = CmdRunAction(
command='cd /workspace && ls -l',
keep_prompt=False,
)
action = CmdRunAction(command='cd /workspace && ls -l')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
Expand Down Expand Up @@ -300,10 +297,7 @@ def complete_runtime(
instance_id = instance.instance_id.replace('/', '__')
path = os.path.join('/workspace', f'{instance_id}.py')

action = CmdRunAction(
command=f'cat {path}',
keep_prompt=False,
)
action = CmdRunAction(command=f'cat {path}')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

Expand Down
3 changes: 0 additions & 3 deletions evaluation/benchmarks/humanevalfix/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
"observation": "run",
"content": "[File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2: return number % 1.0 + 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10: assert truncate_number(3.5) == 0.5\r\n11: assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12: assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)",
"extras": {
"command_id": -1,
"command": "open Python__2.py",
"exit_code": 0
}
Expand All @@ -98,7 +97,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
"observation": "run",
"content": "> > [File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2: return number % 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10: assert truncate_number(3.5) == 0.5\r\n11: assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12: assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)\r\nFile updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.",
"extras": {
"command_id": -1,
"command": "edit 2:2 <<EOF\n return number % 1.0\nEOF",
"exit_code": 0
}
Expand All @@ -125,7 +123,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
"observation": "run",
"content": "",
"extras": {
"command_id": -1,
"command": "python3 Python__2.py",
"exit_code": 0
}
Expand Down
4 changes: 1 addition & 3 deletions evaluation/benchmarks/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,7 @@ def complete_runtime(
num_workers = LANGUAGE_TO_NUM_WORKERS[language]
python_imports = '\n'.join(IMPORT_HELPER[language])

action = CmdRunAction(
command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
)
action = CmdRunAction(command=f'cat /workspace/{_get_instance_id(instance)}.py')
obs = runtime.run_action(action)
assert obs.exit_code == 0

Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def complete_runtime(
eval_script = os.path.join(task_path, 'run.sh')
logger.info(f'Running evaluation script: {eval_script}')

action = CmdRunAction(command=f'cat {eval_script}', keep_prompt=False)
action = CmdRunAction(command=f'cat {eval_script}')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
if obs.exit_code == 0:
Expand Down
10 changes: 2 additions & 8 deletions evaluation/benchmarks/scienceagentbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,7 @@ def initialize_runtime(
runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True)

# Check the dataset exists
action = CmdRunAction(
command='cd /workspace/benchmark/datasets && ls',
keep_prompt=False,
)
action = CmdRunAction(command='cd /workspace/benchmark/datasets && ls')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
Expand Down Expand Up @@ -154,10 +151,7 @@ def complete_runtime(

assert obs.exit_code == 0

action = CmdRunAction(
command=f'cat pred_programs/{instance.pred_program_name}',
keep_prompt=False,
)
action = CmdRunAction(command=f'cat pred_programs/{instance.pred_program_name}')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)

Expand Down
21 changes: 15 additions & 6 deletions evaluation/benchmarks/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
log_dir: str | None = None,
runtime_failure_count: int = 0,
) -> EvalOutput:
"""
Evaluate agent performance on a SWE-bench problem instance.
Expand Down Expand Up @@ -146,6 +147,16 @@ def process_instance(
metadata=metadata,
)

# Increase resource_factor with increasing attempt_id
if runtime_failure_count > 0:
config.sandbox.remote_runtime_resource_factor = min(
config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
4, # hardcode maximum resource factor to 4
)
logger.warning(
f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
)

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
# Get patch and save it to /tmp/patch.diff
Expand Down Expand Up @@ -177,7 +188,7 @@ def process_instance(
"(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
"echo 'APPLY_PATCH_FAIL')))"
)
action = CmdRunAction(command=exec_command, keep_prompt=False)
action = CmdRunAction(command=exec_command)
action.timeout = 600
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
Expand All @@ -200,9 +211,7 @@ def process_instance(

# Run eval script in background and save output to log file
log_file = '/tmp/eval_output.log'
action = CmdRunAction(
command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
)
action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
action.timeout = 60 # Short timeout just to get the process ID
obs = runtime.run_action(action)

Expand All @@ -224,7 +233,7 @@ def process_instance(
instance['test_result']['report']['test_timeout'] = True
break
check_action = CmdRunAction(
command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
command=f'ps -p {pid} > /dev/null; echo $?'
)
check_action.timeout = 60
check_obs = runtime.run_action(check_action)
Expand All @@ -242,7 +251,7 @@ def process_instance(
time.sleep(30) # Wait for 30 seconds before checking again

# Read the log file
cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
cat_action = CmdRunAction(command=f'cat {log_file}')
cat_action.timeout = 300
cat_obs = runtime.run_action(cat_action)

Expand Down
16 changes: 13 additions & 3 deletions evaluation/benchmarks/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,16 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

action = CmdRunAction(command='which python')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0 and 'testbed' in obs.content,
f'Expected to find python interpreter from testbed, but got: {str(obs)}',
)

logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
Expand Down Expand Up @@ -337,8 +347,7 @@ def complete_runtime(
git_patch = None
while n_retries < 5:
action = CmdRunAction(
command=f'git diff --no-color --cached {instance["base_commit"]}',
keep_prompt=False,
command=f'git diff --no-color --cached {instance["base_commit"]}'
)
action.timeout = 600 + 100 * n_retries
logger.info(action, extra={'msg_type': 'ACTION'})
Expand Down Expand Up @@ -385,7 +394,7 @@ def process_instance(
if runtime_failure_count > 0:
config.sandbox.remote_runtime_resource_factor = min(
config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
2, # hardcode maximum resource factor to 2
8,
)
logger.warning(
f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
Expand Down Expand Up @@ -535,4 +544,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
args.eval_num_workers,
process_instance,
timeout_seconds=120 * 60, # 2 hour PER instance should be more than enough
max_retries=5,
)
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ def _get_resolved(report):
# Determine if this repo has a significant diff
is_significant = diff >= threshold
repo_color = 'red' if is_significant else 'yellow'
print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))

print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
if x_instances:
print(' ' + str(x_instances))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@
print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')

oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)

swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
if os.path.exists(swebench_eval_file):
eval_output_df = pd.read_json(swebench_eval_file, orient='records', lines=True)
else:
eval_output_df = None

# model name is the folder name of oh_output_file
model_name = os.path.basename(os.path.dirname(args.oh_output_file))

Expand Down Expand Up @@ -50,16 +57,29 @@ def convert_history_to_str(history):
return ret


def write_row_to_md_file(row):
def write_row_to_md_file(row, instance_id_to_test_result):
if 'git_patch' in row:
model_patch = row['git_patch']
elif 'test_result' in row and 'git_patch' in row['test_result']:
model_patch = row['test_result']['git_patch']
else:
raise ValueError(f'Row {row} does not have a git_patch')

if 'report' in row:
resolved = row['report'].get('resolved', False)
test_output = None
if row['instance_id'] in instance_id_to_test_result:
report = instance_id_to_test_result[row['instance_id']].get('report', {})
resolved = report.get('resolved', False)
test_output = instance_id_to_test_result[row['instance_id']].get(
'test_output', None
)
elif 'report' in row and row['report'] is not None:
if not isinstance(row['report'], dict):
resolved = None
print(
f'ERROR: Report is not a dict, but a {type(row["report"])}. Row: {row}'
)
else:
resolved = row['report'].get('resolved', False)
else:
resolved = None

Expand All @@ -84,5 +104,18 @@ def write_row_to_md_file(row):
f.write('## Model Patch\n')
f.write(f'{process_git_patch(model_patch)}\n')

f.write('## Test Output\n')
f.write(str(test_output))


instance_id_to_test_result = {}
if eval_output_df is not None:
instance_id_to_test_result = (
eval_output_df[['instance_id', 'test_result']]
.set_index('instance_id')['test_result']
.to_dict()
)

oh_format.progress_apply(write_row_to_md_file, axis=1)
oh_format.progress_apply(
write_row_to_md_file, axis=1, instance_id_to_test_result=instance_id_to_test_result
)
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ def apply_report(row):
instance_id_to_status[row['instance_id']] = row['test_result']['report']
df['report'] = df.apply(apply_report, axis=1)

report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
if not report_is_dict.all():
print(df[~report_is_dict])
raise ValueError(f'Report is not a dict, but a {type(row["report"])}')

_n_instances = len(df)
_n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
_n_unresolved = _n_instances - _n_resolved
Expand Down
2 changes: 1 addition & 1 deletion evaluation/integration_tests/tests/t01_fix_simple_typo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None:
@classmethod
def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
# check if the file /workspace/bad.txt has been fixed
action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
action = CmdRunAction(command='cat /workspace/bad.txt')
obs = runtime.run_action(action)
if obs.exit_code != 0:
return TestResult(
Expand Down
Loading

0 comments on commit ec70af9

Please sign in to comment.