refactor: Replace pexpect with libtmux in BashSession (#4881)

Co-authored-by: openhands <[email protected]> Co-authored-by: Engel Nyst <[email protected]> Co-authored-by: Robert Brennan <[email protected]>
All-Hands-AI · Jan 3, 2025 · ec70af9 · ec70af9
1 parent 761a574
commit ec70af9
Show file tree

Hide file tree

Showing 66 changed files with 2,343 additions and 758 deletions.
diff --git a/.github/workflows/dummy-agent-test.yml b/.github/workflows/dummy-agent-test.yml
@@ -36,6 +36,8 @@ jobs:
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Set up Python

diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml
@@ -29,6 +29,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
 

diff --git a/.github/workflows/py-unit-tests-mac.yml b/.github/workflows/py-unit-tests-mac.yml
@@ -31,6 +31,8 @@ jobs:
           key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
           restore-keys: |
             ${{ runner.os }}-poetry-
+      - name: Install tmux
+        run: brew install tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Install Python dependencies using Poetry

diff --git a/.github/workflows/py-unit-tests.yml b/.github/workflows/py-unit-tests.yml
@@ -30,6 +30,8 @@ jobs:
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Set up Python

diff --git a/docs/static/img/backend_architecture.puml b/docs/static/img/backend_architecture.puml
@@ -123,7 +123,6 @@ class openhands.state.State {
   updated_info: List[Tuple[Action, Observation]]
 }
 class openhands.observation.CmdOutputObservation {
-  command_id: int
   command: str
   exit_code: int
   observation: str

diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -137,7 +137,6 @@ def complete_runtime(
 
         action = CmdRunAction(
             command=f'chmod +x ./{script_name} && ./{script_name}',
-            keep_prompt=False,
         )
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
@@ -164,8 +163,7 @@ def complete_runtime(
             logger.info(f'Running get ground truth cmd: {script_name}')
 
             action = CmdRunAction(
-                command=f'chmod +x ./{script_name} && ./{script_name}',
-                keep_prompt=False,
+                command=f'chmod +x ./{script_name} && ./{script_name}'
             )
             logger.info(action, extra={'msg_type': 'ACTION'})
             obs = runtime.run_action(action)

diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -145,10 +145,7 @@ def complete_runtime(
         )
         logger.info(f'Running test file: {script_name}')
 
-    action = CmdRunAction(
-        command=f'python3 -m unittest {script_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'python3 -m unittest {script_name}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})

diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
@@ -199,7 +199,7 @@ def complete_runtime(
     if obs.exit_code == 0:
         test_result['metadata']['1_copy_change_success'] = True
 
-        action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
+        action = CmdRunAction(command=f'cat {generated_path}')
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         assert obs.exit_code == 0
@@ -223,9 +223,7 @@ def complete_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    action = CmdRunAction(
-        command='cat /testing_files/results_biocoder.json', keep_prompt=False
-    )
+    action = CmdRunAction(command='cat /testing_files/results_biocoder.json')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     if obs.exit_code == 0:

diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md
@@ -127,7 +127,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
         "observation": "run",
         "content": "california_schools/california_schools.sqlite\r\n[(1.0,)]",
         "extras": {
-          "command_id": -1,
           "command": "python3 0.py",
           "exit_code": 0
         }

diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
@@ -268,10 +268,7 @@ def initialize_runtime(
     runtime.copy_to(db_file, '/workspace')
 
     # Check the database is copied
-    action = CmdRunAction(
-        command='cd /workspace && ls -l',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace && ls -l')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
@@ -300,10 +297,7 @@ def complete_runtime(
     instance_id = instance.instance_id.replace('/', '__')
     path = os.path.join('/workspace', f'{instance_id}.py')
 
-    action = CmdRunAction(
-        command=f'cat {path}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat {path}')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
 

diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md
@@ -71,7 +71,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "[File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0 + 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)",
                 "extras": {
-                    "command_id": -1,
                     "command": "open Python__2.py",
                     "exit_code": 0
                 }
@@ -98,7 +97,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "> > [File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)\r\nFile updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.",
                 "extras": {
-                    "command_id": -1,
                     "command": "edit 2:2 <<EOF\n    return number % 1.0\nEOF",
                     "exit_code": 0
                 }
@@ -125,7 +123,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "",
                 "extras": {
-                    "command_id": -1,
                     "command": "python3 Python__2.py",
                     "exit_code": 0
                 }

diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -171,9 +171,7 @@ def complete_runtime(
     num_workers = LANGUAGE_TO_NUM_WORKERS[language]
     python_imports = '\n'.join(IMPORT_HELPER[language])
 
-    action = CmdRunAction(
-        command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
-    )
+    action = CmdRunAction(command=f'cat /workspace/{_get_instance_id(instance)}.py')
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 

diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -163,7 +163,7 @@ def complete_runtime(
     eval_script = os.path.join(task_path, 'run.sh')
     logger.info(f'Running evaluation script: {eval_script}')
 
-    action = CmdRunAction(command=f'cat {eval_script}', keep_prompt=False)
+    action = CmdRunAction(command=f'cat {eval_script}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     if obs.exit_code == 0:

diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -121,10 +121,7 @@ def initialize_runtime(
     runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True)
 
     # Check the dataset exists
-    action = CmdRunAction(
-        command='cd /workspace/benchmark/datasets && ls',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace/benchmark/datasets && ls')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
@@ -154,10 +151,7 @@ def complete_runtime(
 
     assert obs.exit_code == 0
 
-    action = CmdRunAction(
-        command=f'cat pred_programs/{instance.pred_program_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat pred_programs/{instance.pred_program_name}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
 

diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -98,6 +98,7 @@ def process_instance(
     metadata: EvalMetadata,
     reset_logger: bool = True,
     log_dir: str | None = None,
+    runtime_failure_count: int = 0,
 ) -> EvalOutput:
     """
     Evaluate agent performance on a SWE-bench problem instance.
@@ -146,6 +147,16 @@ def process_instance(
             metadata=metadata,
         )
 
+    # Scale resource_factor up by 2x per prior runtime failure (runtime_failure_count), capped below
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            4,  # hardcode maximum resource factor to 4
+        )
+        logger.warning(
+            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
     # Get patch and save it to /tmp/patch.diff
@@ -177,7 +188,7 @@ def process_instance(
         "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
         "echo 'APPLY_PATCH_FAIL')))"
     )
-    action = CmdRunAction(command=exec_command, keep_prompt=False)
+    action = CmdRunAction(command=exec_command)
     action.timeout = 600
     obs = runtime.run_action(action)
     assert isinstance(obs, CmdOutputObservation)
@@ -200,9 +211,7 @@ def process_instance(
 
             # Run eval script in background and save output to log file
             log_file = '/tmp/eval_output.log'
-            action = CmdRunAction(
-                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
-            )
+            action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
             action.timeout = 60  # Short timeout just to get the process ID
             obs = runtime.run_action(action)
 
@@ -224,7 +233,7 @@ def process_instance(
                         instance['test_result']['report']['test_timeout'] = True
                         break
                     check_action = CmdRunAction(
-                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                        command=f'ps -p {pid} > /dev/null; echo $?'
                     )
                     check_action.timeout = 60
                     check_obs = runtime.run_action(check_action)
@@ -242,7 +251,7 @@ def process_instance(
                     time.sleep(30)  # Wait for 30 seconds before checking again
 
                 # Read the log file
-                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action = CmdRunAction(command=f'cat {log_file}')
                 cat_action.timeout = 300
                 cat_obs = runtime.run_action(cat_action)
 

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -282,6 +282,16 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
 
+    action = CmdRunAction(command='which python')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0 and 'testbed' in obs.content,
+        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+    )
+
     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
     logger.info('-' * 30)
@@ -337,8 +347,7 @@ def complete_runtime(
     git_patch = None
     while n_retries < 5:
         action = CmdRunAction(
-            command=f'git diff --no-color --cached {instance["base_commit"]}',
-            keep_prompt=False,
+            command=f'git diff --no-color --cached {instance["base_commit"]}'
         )
         action.timeout = 600 + 100 * n_retries
         logger.info(action, extra={'msg_type': 'ACTION'})
@@ -385,7 +394,7 @@ def process_instance(
     if runtime_failure_count > 0:
         config.sandbox.remote_runtime_resource_factor = min(
             config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
-            2,  # hardcode maximum resource factor to 2
+            8,  # hardcode maximum resource factor to 8
         )
         logger.warning(
             f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
@@ -535,4 +544,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         args.eval_num_workers,
         process_instance,
         timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
+        max_retries=5,
     )
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
@@ -104,9 +104,9 @@ def _get_resolved(report):
     # Determine if this repo has a significant diff
     is_significant = diff >= threshold
     repo_color = 'red' if is_significant else 'yellow'
-    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
 
     print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
+    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
     print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
     if x_instances:
         print('  ' + str(x_instances))

diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
@@ -20,6 +20,13 @@
 print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
 
 oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+
+swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
+if os.path.exists(swebench_eval_file):
+    eval_output_df = pd.read_json(swebench_eval_file, orient='records', lines=True)
+else:
+    eval_output_df = None
+
 # model name is the folder name of oh_output_file
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))
 
@@ -50,16 +57,29 @@ def convert_history_to_str(history):
     return ret
 
 
-def write_row_to_md_file(row):
+def write_row_to_md_file(row, instance_id_to_test_result):
     if 'git_patch' in row:
         model_patch = row['git_patch']
     elif 'test_result' in row and 'git_patch' in row['test_result']:
         model_patch = row['test_result']['git_patch']
     else:
         raise ValueError(f'Row {row} does not have a git_patch')
 
-    if 'report' in row:
-        resolved = row['report'].get('resolved', False)
+    test_output = None
+    if row['instance_id'] in instance_id_to_test_result:
+        report = instance_id_to_test_result[row['instance_id']].get('report', {})
+        resolved = report.get('resolved', False)
+        test_output = instance_id_to_test_result[row['instance_id']].get(
+            'test_output', None
+        )
+    elif 'report' in row and row['report'] is not None:
+        if not isinstance(row['report'], dict):
+            resolved = None
+            print(
+                f'ERROR: Report is not a dict, but a {type(row["report"])}. Row: {row}'
+            )
+        else:
+            resolved = row['report'].get('resolved', False)
     else:
         resolved = None
 
@@ -84,5 +104,18 @@ def write_row_to_md_file(row):
         f.write('## Model Patch\n')
         f.write(f'{process_git_patch(model_patch)}\n')
 
+        f.write('## Test Output\n')
+        f.write(str(test_output))
+
+
+instance_id_to_test_result = {}
+if eval_output_df is not None:
+    instance_id_to_test_result = (
+        eval_output_df[['instance_id', 'test_result']]
+        .set_index('instance_id')['test_result']
+        .to_dict()
+    )
 
-oh_format.progress_apply(write_row_to_md_file, axis=1)
+oh_format.progress_apply(
+    write_row_to_md_file, axis=1, instance_id_to_test_result=instance_id_to_test_result
+)
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@@ -111,6 +111,11 @@ def apply_report(row):
         instance_id_to_status[row['instance_id']] = row['test_result']['report']
     df['report'] = df.apply(apply_report, axis=1)
 
+    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
+    if not report_is_dict.all():
+        print(df[~report_is_dict])
+        raise ValueError(f'Report is not a dict, but a {type(row["report"])}')
+
     _n_instances = len(df)
     _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
     _n_unresolved = _n_instances - _n_resolved

diff --git a/evaluation/integration_tests/tests/t01_fix_simple_typo.py b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@@ -24,7 +24,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None:
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
         # check if the file /workspace/bad.txt has been fixed
-        action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/bad.txt')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(