diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
index 7beacf344408..231b132dfe8b 100644
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -1,3 +1,4 @@
+import json
 import os
 import tempfile
 import time
@@ -11,6 +12,7 @@
 )
 from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
 from swebench.harness.utils import load_swebench_dataset
+from tqdm import tqdm

 from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
@@ -81,7 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
             base_container_image=base_container_image,
             use_host_network=False,
             # large enough timeout, since some testcases take very long to run
-            timeout=1800,
+            timeout=600,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             remote_runtime_init_timeout=3600,
@@ -157,46 +159,46 @@ def process_instance(
             f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )

-    runtime = create_runtime(config)
-    call_async_from_sync(runtime.connect)
-    # Get patch and save it to /tmp/patch.diff
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Patch file
-        patch_file_path = os.path.join(temp_dir, 'patch.diff')
-        with open(patch_file_path, 'w') as f:
-            f.write(model_patch)
-        runtime.copy_to(patch_file_path, '/tmp')
-        # Eval script
-        eval_script_path = os.path.join(temp_dir, 'eval.sh')
-        with open(eval_script_path, 'w') as f:
-            f.write(test_spec.eval_script)
-        runtime.copy_to(eval_script_path, '/tmp')
-
-    # Set +x
-    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
-
-    # Apply patch
-    exec_command = (
-        'cd /testbed && '
-        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
-        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "echo 'APPLY_PATCH_FAIL')))"
-    )
-    action = CmdRunAction(command=exec_command)
-    action.timeout = 600
-    obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
-    apply_patch_output = obs.content
-    assert isinstance(apply_patch_output, str)
-    instance['test_result']['apply_patch_output'] = apply_patch_output
-
-    try:
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    # Get patch and save it to /tmp/patch.diff
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Patch file
+        patch_file_path = os.path.join(temp_dir, 'patch.diff')
+        with open(patch_file_path, 'w') as f:
+            f.write(model_patch)
+        runtime.copy_to(patch_file_path, '/tmp')
+        # Eval script
+        eval_script_path = os.path.join(temp_dir, 'eval.sh')
+        with open(eval_script_path, 'w') as f:
+            f.write(test_spec.eval_script)
+        runtime.copy_to(eval_script_path, '/tmp')
+
+    # Set +x
+    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0
+
+    # Apply patch
+    exec_command = (
+        'cd /testbed && '
+        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "echo 'APPLY_PATCH_FAIL')))"
+    )
+    action = CmdRunAction(command=exec_command)
+    action.timeout = 600
+    obs = runtime.run_action(action)
+    assert isinstance(obs, CmdOutputObservation)
+    apply_patch_output = obs.content
+    assert isinstance(apply_patch_output, str)
+    instance['test_result']['apply_patch_output'] = apply_patch_output
+
+    if 'APPLY_PATCH_FAIL' in apply_patch_output:
         logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
         instance['test_result']['report']['failed_apply_patch'] = True
@@ -352,7 +354,13 @@ def process_instance(

     # Load predictions
     assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
-    predictions = pd.read_json(args.input_file, lines=True)
+    required_fields = ['instance_id', 'model_patch', 'test_result']
+    predictions = pd.DataFrame.from_records(
+        [
+            {k: v for k, v in json.loads(line).items() if k in required_fields}
+            for line in tqdm(open(args.input_file), desc='Loading predictions')
+        ]
+    )
     assert (
         'instance_id' in predictions.columns
     ), 'Input file must contain instance_id column.'
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index bf065ada9734..b0563928d3ea 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -41,8 +41,11 @@
 from openhands.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
-USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
+    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
+)

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -135,6 +138,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_resource_factor=int(DEFAULT_RUNTIME_RESOURCE_FACTOR),
         ),
         # do not mount workspace
         workspace_base=None,
@@ -239,7 +243,7 @@ def initialize_runtime(
     assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

     action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-    action.timeout = 3600
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -351,7 +355,7 @@ def complete_runtime(
         action = CmdRunAction(
             command=f'git diff --no-color --cached {instance["base_commit"]}'
         )
-        action.timeout = 600 + 100 * n_retries
+        action.timeout = max(300 + 100 * n_retries, 600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -399,7 +403,7 @@ def process_instance(
             8,
         )
         logger.warning(
-            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
@@ -482,6 +486,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     return dataset


+SWEGYM_EXCLUDE_IDS = [
+    'dask__dask-10422',
+]
+
 if __name__ == '__main__':
     parser = get_parser()
     parser.add_argument(
@@ -501,8 +509,17 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenHands's repo
     dataset = load_dataset(args.dataset, split=args.split)
-    logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
     swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
+    )
+    if 'SWE-Gym' in args.dataset:
+        swe_bench_tests = swe_bench_tests[
+            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
+        ]
+        logger.info(
+            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
+        )

     llm_config = None
     if args.llm_config:
@@ -531,6 +548,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     )

     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    print(f'### OUTPUT FILE: {output_file} ###')
     instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

     if len(instances) > 0 and not isinstance(
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py
new file mode 100644
index 000000000000..cc1023f264e4
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py
@@ -0,0 +1,54 @@
+import argparse
+import gzip
+import json
+import os
+from glob import glob
+
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+# Load trajectories for resolved instances
+def load_completions(output_dir: str, instance_id: str):
+    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
+    files = sorted(glob(glob_path))  # this is ascending order
+    # pick the last file (last turn)
+    try:
+        file_path = files[-1]
+    except IndexError:
+        # print(f'No files found for instance {instance_id}: files={files}')
+        return None
+    with open(file_path, 'r') as f:
+        result = json.load(f)
+    # create messages
+    messages = result['messages']
+    messages.append(result['response']['choices'][0]['message'])
+    tools = result['kwargs']['tools']
+    return {
+        'messages': messages,
+        'tools': tools,
+    }
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('jsonl_path', type=str)
+args = parser.parse_args()
+
+output_dir = os.path.dirname(args.jsonl_path)
+output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')
+
+if os.path.exists(output_path):
+    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
+    if input() != 'y':
+        print('Exiting...')
+        exit(0)
+
+# Process line by line
+with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out:
+    for line in tqdm(f_in):
+        data = json.loads(line)
+        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
+        f_out.write(json.dumps(data) + '\n')
+
+print(f'Saved compressed output to {output_path}')
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
index d9c5c540f24b..430e0258a7f9 100644
--- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@@ -3,7 +3,7 @@
 import os
 from collections import defaultdict

-import pandas as pd
+from tqdm import tqdm

 parser = argparse.ArgumentParser()
 parser.add_argument('input_file', type=str)
@@ -11,8 +11,7 @@

 dirname = os.path.dirname(args.input_file)

-df = pd.read_json(args.input_file, lines=True)
-
+# Initialize counters and data structures
 instance_id_to_status = defaultdict(
     lambda: {
         'empty_generation': False,
@@ -23,15 +22,7 @@
     }
 )
-
-# Apply the status to the dataframe
-def apply_report(row):
-    instance_id = row['instance_id']
-    if instance_id in instance_id_to_status:
-        return dict(instance_id_to_status[instance_id])
-    return row.get('report', {})
-
-
+# Process official report if it exists
 swebench_official_report_json = os.path.join(dirname, 'report.json')
 openhands_remote_report_jsonl = args.input_file.replace(
     '.jsonl', '.swebench_eval.jsonl'
 )
@@ -90,113 +81,142 @@ def apply_report(row):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )

-    df['report'] = df.apply(apply_report, axis=1)
-
     with open(output_md_filepath, 'w') as f:
         f.write(output_md)
 elif os.path.exists(openhands_remote_report_jsonl):
     output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
-    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
-
-    assert len(df['instance_id'].unique()) == len(
-        df
-    ), 'There are duplicate instance ids in the original output which is not allowed'
-    assert len(df_eval['instance_id'].unique()) == len(
-        df_eval
-    ), 'There are duplicate instance ids in the eval report which is not allowed'
-
-    for _, row in df_eval.iterrows():
-        instance_id_to_status[row['instance_id']] = row['test_result']['report']
-    df['report'] = df.apply(apply_report, axis=1)
-
-    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
-    if not report_is_dict.all():
-        print(df[~report_is_dict])
-        raise ValueError(f'Report is not a dict, but a {type(row["report"])}')
-
-    _n_instances = len(df)
-    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
-    _n_unresolved = _n_instances - _n_resolved
-    _n_empty_patch = len(
-        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
-    )
-    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    # First pass: Read eval report and count instances
+    instance_ids = set()
+    eval_instance_ids = set()
+
+    # Count instances in original file
+    n_instances = 0
+    with open(args.input_file, 'r') as f:
+        for line in tqdm(f, desc='Counting instances in original file'):
+            data = json.loads(line)
+            instance_ids.add(data['instance_id'])
+            n_instances += 1
+    print(f'Total instances in original file: {n_instances}')
+
+    # Process eval report
+    n_eval_instances = 0
+    with open(openhands_remote_report_jsonl, 'r') as f:
+        for line in tqdm(f, desc='Processing eval report'):
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            eval_instance_ids.add(instance_id)
+            n_eval_instances += 1
+            instance_id_to_status[instance_id] = data['test_result']['report']
+    print(f'Total instances in eval report: {n_eval_instances}')
+
+    # Verify no duplicates
+    assert (
+        len(instance_ids) == n_instances
+    ), 'Duplicate instance ids found in original output'
+    assert (
+        len(eval_instance_ids) == n_eval_instances
+    ), 'Duplicate instance ids found in eval report'
+
+    # Initialize counters
+    stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0}
+
+    # Collect instance IDs by category
+    resolved_ids = []
+    unresolved_ids = []
+    error_ids = []
+    empty_patch_ids = []
+    timeout_ids = []
+
+    # Process original file and categorize instances
+    with open(args.input_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            report = instance_id_to_status[instance_id]
+
+            if report.get('resolved', False):
+                stats['resolved'] += 1
+                resolved_ids.append(instance_id)
+            else:
+                unresolved_ids.append(instance_id)
+
+            if report.get('empty_generation', False):
+                stats['empty_patch'] += 1
+                empty_patch_ids.append(instance_id)
+            if report.get('error_eval', False):
+                stats['error'] += 1
+                error_ids.append(instance_id)
+            if report.get('test_timeout', False):
+                timeout_ids.append(instance_id)
+
+    # Generate markdown report
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        return os.path.relpath(path, start=dirname)
+
+    # ... rest of markdown generation code remains the same ...
     output_md = (
         '# SWE-bench Report\n'
         'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
         '## Summary\n'
-        f'- submitted instances: {_n_instances}\n'
-        f'- empty patch instances: {_n_empty_patch}\n'
-        f'- resolved instances: {_n_resolved}\n'
-        f'- unresolved instances: {_n_unresolved}\n'
-        f'- error instances: {_n_error}\n'
+        f'- submitted instances: {stats["total"]}\n'
+        f'- empty patch instances: {stats["empty_patch"]}\n'
+        f'- resolved instances: {stats["resolved"]}\n'
+        f'- unresolved instances: {len(unresolved_ids)}\n'
+        f'- error instances: {stats["error"]}\n'
     )

-    def _instance_id_to_log_path(instance_id):
-        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
-        # make it relative path
-        path = os.path.relpath(path, start=dirname)
-        return path
-
     output_md += '\n## Resolved Instances\n'
     # instance_id to status
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('resolved', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in resolved_ids:
         instance_id_to_status[instance_id]['resolved'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Unresolved Instances\n'
-    for instance_id in sorted(
-        df[~df['report'].apply(lambda x: x.get('resolved', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in unresolved_ids:
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Error Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('error_eval', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in error_ids:
         instance_id_to_status[instance_id]['error_eval'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Empty Patch Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in empty_patch_ids:
         instance_id_to_status[instance_id]['empty_generation'] = True
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

     output_md += '\n## Incomplete Instances\n'
-    for instance_id in sorted(
-        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
-            'instance_id'
-        ].unique()
-    ):
+    for instance_id in timeout_ids:
         output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
     with open(output_md_filepath, 'w') as f:
         f.write(output_md)
+
 else:
     print(
         f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
     )
     exit()

+# Backup and update the original file row by row
 if os.path.exists(args.input_file + '.bak'):
     conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
     if conf != 'y':
         exit()
     os.remove(args.input_file + '.bak')
-# backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
-df.to_json(args.input_file, orient='records', lines=True)
+
+# Process and write file row by row
+with open(args.input_file + '.bak', 'r') as infile, open(
+    args.input_file, 'w'
+) as outfile:
+    for line in tqdm(infile, desc='Updating output file'):
+        data = json.loads(line)
+        instance_id = data['instance_id']
+        if instance_id in instance_id_to_status:
+            data['report'] = instance_id_to_status[instance_id]
+        outfile.write(json.dumps(data) + '\n')