diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
index 7beacf344408..231b132dfe8b 100644
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -1,3 +1,4 @@
+import json
 import os
 import tempfile
 import time
@@ -11,6 +12,7 @@
 )
 from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
 from swebench.harness.utils import load_swebench_dataset
+from tqdm import tqdm

 from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
@@ -81,7 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
             base_container_image=base_container_image,
             use_host_network=False,
             # large enough timeout, since some testcases take very long to run
-            timeout=1800,
+            timeout=600,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             remote_runtime_init_timeout=3600,
@@ -157,46 +159,46 @@ def process_instance(
             f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )

-    runtime = create_runtime(config)
-    call_async_from_sync(runtime.connect)
-    # Get patch and save it to /tmp/patch.diff
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Patch file
-        patch_file_path = os.path.join(temp_dir, 'patch.diff')
-        with open(patch_file_path, 'w') as f:
-            f.write(model_patch)
-        runtime.copy_to(patch_file_path, '/tmp')
-        # Eval script
-        eval_script_path = os.path.join(temp_dir, 'eval.sh')
-        with open(eval_script_path, 'w') as f:
-            f.write(test_spec.eval_script)
-        runtime.copy_to(eval_script_path, '/tmp')
-
-    # Set +x
-    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
-
-    # Apply patch
-    exec_command = (
-        'cd /testbed && '
-        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
-        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "echo 'APPLY_PATCH_FAIL')))"
-    )
-    action = CmdRunAction(command=exec_command)
-    action.timeout = 600
-    obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
-    apply_patch_output = obs.content
-    assert isinstance(apply_patch_output, str)
-    instance['test_result']['apply_patch_output'] = apply_patch_output
-
-    try:
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    # Get patch and save it to /tmp/patch.diff
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Patch file
+        patch_file_path = os.path.join(temp_dir, 'patch.diff')
+        with open(patch_file_path, 'w') as f:
+            f.write(model_patch)
+        runtime.copy_to(patch_file_path, '/tmp')
+        # Eval script
+        eval_script_path = os.path.join(temp_dir, 'eval.sh')
+        with open(eval_script_path, 'w') as f:
+            f.write(test_spec.eval_script)
+        runtime.copy_to(eval_script_path, '/tmp')
+
+    # Set +x
+    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0
+
+    # Apply patch
+    exec_command = (
+        'cd /testbed && '
+        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "echo 'APPLY_PATCH_FAIL')))"
+    )
+    action = CmdRunAction(command=exec_command)
+    action.timeout = 600
+    obs = runtime.run_action(action)
+    assert isinstance(obs, CmdOutputObservation)
+    apply_patch_output = obs.content
+    assert isinstance(apply_patch_output, str)
+    instance['test_result']['apply_patch_output'] = apply_patch_output
+
+    if 'APPLY_PATCH_FAIL' in apply_patch_output:
         logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
         instance['test_result']['report']['failed_apply_patch'] = True
@@ -352,7 +354,13 @@ def process_instance(

     # Load predictions
     assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
-    predictions = pd.read_json(args.input_file, lines=True)
+    required_fields = ['instance_id', 'model_patch', 'test_result']
+    predictions = pd.DataFrame.from_records(
+        [
+            {k: v for k, v in json.loads(line).items() if k in required_fields}
+            for line in tqdm(open(args.input_file), desc='Loading predictions')
+        ]
+    )
     assert (
         'instance_id' in predictions.columns
     ), 'Input file must contain instance_id column.'
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index bf065ada9734..b0563928d3ea 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -41,8 +41,11 @@
 from openhands.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
-USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
+    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
+)

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -135,6 +138,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_resource_factor=int(DEFAULT_RUNTIME_RESOURCE_FACTOR),
         ),
         # do not mount workspace
         workspace_base=None,
@@ -239,7 +243,7 @@ def initialize_runtime(
     assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

     action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-    action.timeout = 3600
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -351,7 +355,7 @@ def complete_runtime(
         action = CmdRunAction(
             command=f'git diff --no-color --cached {instance["base_commit"]}'
         )
-        action.timeout = 600 + 100 * n_retries
+        action.timeout = max(300 + 100 * n_retries, 600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -399,7 +403,7 @@ def process_instance(
             8,
         )
         logger.warning(
-            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
@@ -482,6 +486,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     return dataset


+SWEGYM_EXCLUDE_IDS = [
+    'dask__dask-10422',
+]
+
 if __name__ == '__main__':
     parser = get_parser()
     parser.add_argument(
@@ -501,8 +509,17 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenHands's repo
     dataset = load_dataset(args.dataset, split=args.split)
-    logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
     swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
+    )
+    if 'SWE-Gym' in args.dataset:
+        swe_bench_tests = swe_bench_tests[
+            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
+        ]
+        logger.info(
+            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
+        )

     llm_config = None
     if args.llm_config:
@@ -531,6 +548,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     )

     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    print(f'### OUTPUT FILE: {output_file} ###')
     instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

     if len(instances) > 0 and not isinstance(
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py
new file mode 100644
index 000000000000..cc1023f264e4
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py
@@ -0,0 +1,54 @@
+import argparse
+import gzip
+import json
+import os
+from glob import glob
+
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+# Load trajectories for resolved instances
+def load_completions(output_dir: str, instance_id: str):
+    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
+    files = sorted(glob(glob_path))  # this is ascending order
+    # pick the last file (last turn)
+    try:
+        file_path = files[-1]
+    except IndexError:
+        # print(f'No files found for instance {instance_id}: files={files}')
+        return None
+    with open(file_path, 'r') as f:
+        result = json.load(f)
+    # create messages
+    messages = result['messages']
+    messages.append(result['response']['choices'][0]['message'])
+    tools = result['kwargs']['tools']
+    return {
+        'messages': messages,
+        'tools': tools,
+    }
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('jsonl_path', type=str)
+args = parser.parse_args()
+
+output_dir = os.path.dirname(args.jsonl_path)
+output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')
+
+if os.path.exists(output_path):
+    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
+    if input() != 'y':
+        print('Exiting...')
+        exit(0)
+
+# Process line by line
+with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out:
+    for line in tqdm(f_in):
+        data = json.loads(line)
+        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
+        f_out.write(json.dumps(data) + '\n')
+
+print(f'Saved compressed output to {output_path}')
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
index d9c5c540f24b..430e0258a7f9 100644
--- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@@ -3,7 +3,7 @@
 import os
 from collections import defaultdict

-import pandas as pd
+from tqdm import tqdm

 parser = argparse.ArgumentParser()
 parser.add_argument('input_file', type=str)
@@ -11,8 +11,7 @@

 dirname = os.path.dirname(args.input_file)

-df = pd.read_json(args.input_file, lines=True)
-
+# Initialize counters and data structures
 instance_id_to_status = defaultdict(
     lambda: {
         'empty_generation': False,
@@ -23,15 +22,7 @@
     }
 )
-
-# Apply the status to the dataframe
-def apply_report(row):
-    instance_id = row['instance_id']
-    if instance_id in instance_id_to_status:
-        return dict(instance_id_to_status[instance_id])
-    return row.get('report', {})
-
-
+# Process official report if it exists
 swebench_official_report_json = os.path.join(dirname, 'report.json')
 openhands_remote_report_jsonl = args.input_file.replace(
     '.jsonl', '.swebench_eval.jsonl'
 )
@@ -90,113 +81,142 @@ def apply_report(row):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )

-    df['report'] = df.apply(apply_report, axis=1)
-
     with open(output_md_filepath, 'w') as f:
         f.write(output_md)
 elif os.path.exists(openhands_remote_report_jsonl):
     output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
-    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
-
-    assert len(df['instance_id'].unique()) == len(
-        df
-    ), 'There are duplicate instance ids in the original output which is not allowed'
-    assert len(df_eval['instance_id'].unique()) == len(
-        df_eval
-    ), 'There are duplicate instance ids in the eval report which is not allowed'
-
-    for _, row in df_eval.iterrows():
-        instance_id_to_status[row['instance_id']] = row['test_result']['report']
-    df['report'] = df.apply(apply_report, axis=1)
-
-    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
-    if not report_is_dict.all():
-        print(df[~report_is_dict])
-        raise ValueError(f'Report is not a dict, but a {type(row["report"])}')
-
-    _n_instances = len(df)
-    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
-    _n_unresolved = _n_instances - _n_resolved
-    _n_empty_patch = len(
-        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
-    )
-    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    # First pass: Read eval report and count instances
+    instance_ids = set()
+    eval_instance_ids = set()
+
+    # Count instances in original file
+    n_instances = 0
+    with open(args.input_file, 'r') as f:
+        for line in tqdm(f, desc='Counting instances in original file'):
+            data = json.loads(line)
+            instance_ids.add(data['instance_id'])
+            n_instances += 1
+    print(f'Total instances in original file: {n_instances}')
+
+    # Process eval report
+    n_eval_instances = 0
+    with open(openhands_remote_report_jsonl, 'r') as f:
+        for line in tqdm(f, desc='Processing eval report'):
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            eval_instance_ids.add(instance_id)
+            n_eval_instances += 1
+            instance_id_to_status[instance_id] = data['test_result']['report']
+    print(f'Total instances in eval report: {n_eval_instances}')
+
+    # Verify no duplicates
+    assert (
+        len(instance_ids) == n_instances
+    ), 'Duplicate instance ids found in original output'
+    assert (
+        len(eval_instance_ids) == n_eval_instances
+    ), 'Duplicate instance ids found in eval report'
+
+    # Initialize counters
+    stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0}
+
+    # Collect instance IDs by category
+    resolved_ids = []
+    unresolved_ids = []
+    error_ids = []
+    empty_patch_ids = []
+    timeout_ids = []
+
+    # Process original file and categorize instances
+    with open(args.input_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            report = instance_id_to_status[instance_id]
+
+            if report.get('resolved', False):
+                stats['resolved'] += 1
+                resolved_ids.append(instance_id)
+            else:
+                unresolved_ids.append(instance_id)
+
+            if report.get('empty_generation', False):
+                stats['empty_patch'] += 1
+                empty_patch_ids.append(instance_id)
+            if report.get('error_eval', False):
+                stats['error'] += 1
+                error_ids.append(instance_id)
+            if report.get('test_timeout', False):
+                timeout_ids.append(instance_id)
+
+    # Generate markdown report
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        return os.path.relpath(path, start=dirname)
+
+    # ... rest of markdown generation code remains the same ...
     output_md = (
         '# SWE-bench Report\n'
         'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
         '## Summary\n'
-        f'- submitted instances: {_n_instances}\n'
-        f'- empty patch instances: {_n_empty_patch}\n'
-        f'- resolved instances: {_n_resolved}\n'
-        f'- unresolved instances: {_n_unresolved}\n'
-        f'- error instances: {_n_error}\n'
+        f'- submitted instances: {stats["total"]}\n'
+        f'- empty patch instances: {stats["empty_patch"]}\n'
+        f'- resolved instances: {stats["resolved"]}\n'
+        f'- unresolved instances: {len(unresolved_ids)}\n'
+        f'- error instances: {stats["error"]}\n'
     )

-    def _instance_id_to_log_path(instance_id):
-        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
-        # make it relative path
-        path = os.path.relpath(path, start=dirname)
-        return path
-
     output_md += '\n## Resolved Instances\n'
     # instance_id to status
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('resolved', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in resolved_ids:
         instance_id_to_status[instance_id]['resolved'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Unresolved Instances\n'
-    for instance_id in sorted(
-        df[~df['report'].apply(lambda x: x.get('resolved', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in unresolved_ids:
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Error Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('error_eval', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in error_ids:
         instance_id_to_status[instance_id]['error_eval'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Empty Patch Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in empty_patch_ids:
         instance_id_to_status[instance_id]['empty_generation'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Incomplete Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in timeout_ids:
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
     with open(output_md_filepath, 'w') as f:
         f.write(output_md)
+
 else:
     print(
         f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
     )
     exit()

+# Backup and update the original file row by row
 if os.path.exists(args.input_file + '.bak'):
     conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
     if conf != 'y':
         exit()
     os.remove(args.input_file + '.bak')
-# backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
-df.to_json(args.input_file, orient='records', lines=True)
+
+# Process and write file row by row
+with open(args.input_file + '.bak', 'r') as infile, open(
+    args.input_file, 'w'
+) as outfile:
+    for line in tqdm(infile, desc='Updating output file'):
+        data = json.loads(line)
+        instance_id = data['instance_id']
+        if instance_id in instance_id_to_status:
+            data['report'] = instance_id_to_status[instance_id]
+        outfile.write(json.dumps(data) + '\n')