fix results report
enyst committed Jan 10, 2025
1 parent d07f225 commit 1fb90ec
Showing 3 changed files with 21 additions and 5 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/integration-runner.yml
@@ -57,6 +57,7 @@ jobs:
# LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
# LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
# LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+  # MAX_ITERATIONS: 10
# run: |
# echo "[llm.eval]" > config.toml
# echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -89,6 +90,7 @@ jobs:
# LLM_MODEL: "litellm_proxy/deepseek-chat"
# LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
# LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+  # MAX_ITERATIONS: 10
# run: |
# echo "[llm.eval]" > config.toml
# echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -120,6 +122,7 @@ jobs:
LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -151,6 +154,7 @@ jobs:
LLM_MODEL: "litellm_proxy/deepseek-chat"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
20 changes: 16 additions & 4 deletions evaluation/integration_tests/run_infer.py
@@ -222,7 +222,7 @@ def load_integration_tests() -> pd.DataFrame:

df = pd.read_json(output_file, lines=True, orient='records')

-    # record success and reason for failure for the final report
+    # record success and reason
df['success'] = df['test_result'].apply(lambda x: x['success'])
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
logger.info('-' * 100)
@@ -237,15 +237,27 @@ def load_integration_tests() -> pd.DataFrame:
logger.info('-' * 100)

# record cost for each instance, with 3 decimal places
-    df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
+    # we sum up all the "costs" from the metrics array
+    df['cost'] = df['metrics'].apply(
+        lambda m: round(sum(c['cost'] for c in m['costs']), 3)
+        if m and 'costs' in m
+        else 0.0
+    )
+
+    # capture the top-level error if present, per instance
+    df['error_message'] = df['error']
+
logger.info(f'Total cost: USD {df["cost"].sum():.2f}')

report_file = os.path.join(metadata.eval_output_dir, 'report.md')
with open(report_file, 'w') as f:
f.write(
-            f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
+            f'Success rate: {df["success"].mean():.2%}'
+            f' ({df["success"].sum()}/{len(df)})\n'
)
f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
f.write(
-            df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
+            df[
+                ['instance_id', 'success', 'reason', 'cost', 'error_message']
+            ].to_markdown(index=False)
)
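
For reference, here is a minimal standalone sketch of what the updated report logic computes, runnable outside the harness. The two rows, their instance ids, and the shape of the metrics column (a 'costs' list of per-call entries, each with a 'cost' field) are illustrative assumptions read off the lambda above, not real eval output.

import pandas as pd

# Illustrative rows only; the real frame comes from output.jsonl via pd.read_json.
df = pd.DataFrame(
    {
        'instance_id': ['t01_example', 't02_example'],
        'test_result': [
            {'success': True, 'reason': ''},
            {'success': False, 'reason': 'did not finish'},
        ],
        'metrics': [{'costs': [{'cost': 0.0123}, {'cost': 0.0045}]}, None],
        'error': [None, 'RuntimeError: agent got stuck'],
    }
)

df['success'] = df['test_result'].apply(lambda x: x['success'])
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
# Sum the per-call costs when metrics are present; otherwise report 0.0.
df['cost'] = df['metrics'].apply(
    lambda m: round(sum(c['cost'] for c in m['costs']), 3)
    if m and 'costs' in m
    else 0.0
)
df['error_message'] = df['error']

# Same columns as the report.md table written above (to_markdown needs tabulate).
print(df[['instance_id', 'success', 'reason', 'cost', 'error_message']].to_markdown(index=False))

The None row exercises the new guard; the old one-liner assumed every instance carried an 'accumulated_cost' entry in its metrics.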
2 changes: 1 addition & 1 deletion evaluation/integration_tests/scripts/run_infer.sh
@@ -43,7 +43,7 @@ fi
COMMAND="poetry run python evaluation/integration_tests/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
-  --max-iterations 10 \
+  --max-iterations ${MAX_ITERATIONS:-10} \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"
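
The ${MAX_ITERATIONS:-10} expansion keeps the old behaviour when the workflow does not export the variable: the flag falls back to 10, while the active workflow jobs above export 30. A rough Python equivalent of that default handling, shown only as an illustration (the script itself stays a bash wrapper):

import os

# ${MAX_ITERATIONS:-10}: use the env value when it is set and non-empty, else 10.
max_iterations = int(os.environ.get('MAX_ITERATIONS') or 10)
print(f'--max-iterations {max_iterations}')

So a caller can also set MAX_ITERATIONS in the environment when invoking run_infer.sh locally to match the workflow's setting.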
