From 1fb90ec34497975a9533f66fcb9a0d5d11961336 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Fri, 10 Jan 2025 03:17:56 +0100 Subject: [PATCH] fix results report --- .github/workflows/integration-runner.yml | 4 ++++ evaluation/integration_tests/run_infer.py | 20 +++++++++++++++---- .../integration_tests/scripts/run_infer.sh | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 69cb08f5308f..7ceae21d4479 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -57,6 +57,7 @@ jobs: # LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022" # LLM_API_KEY: ${{ secrets.LLM_API_KEY }} # LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + # MAX_ITERATIONS: 10 # run: | # echo "[llm.eval]" > config.toml # echo "model = \"$LLM_MODEL\"" >> config.toml @@ -89,6 +90,7 @@ jobs: # LLM_MODEL: "litellm_proxy/deepseek-chat" # LLM_API_KEY: ${{ secrets.LLM_API_KEY }} # LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + # MAX_ITERATIONS: 10 # run: | # echo "[llm.eval]" > config.toml # echo "model = \"$LLM_MODEL\"" >> config.toml @@ -120,6 +122,7 @@ jobs: LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022" LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + MAX_ITERATIONS: 30 run: | echo "[llm.eval]" > config.toml echo "model = \"$LLM_MODEL\"" >> config.toml @@ -151,6 +154,7 @@ jobs: LLM_MODEL: "litellm_proxy/deepseek-chat" LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + MAX_ITERATIONS: 30 run: | echo "[llm.eval]" > config.toml echo "model = \"$LLM_MODEL\"" >> config.toml diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index fe411c33becf..5036cc34b541 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -222,7 +222,7 @@ def load_integration_tests() -> pd.DataFrame: df = pd.read_json(output_file, lines=True, orient='records') - # record success and reason for failure for the final report + # record success and reason df['success'] = df['test_result'].apply(lambda x: x['success']) df['reason'] = df['test_result'].apply(lambda x: x['reason']) logger.info('-' * 100) @@ -237,15 +237,27 @@ def load_integration_tests() -> pd.DataFrame: logger.info('-' * 100) # record cost for each instance, with 3 decimal places - df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3)) + # we sum up all the "costs" from the metrics array + df['cost'] = df['metrics'].apply( + lambda m: round(sum(c['cost'] for c in m['costs']), 3) + if m and 'costs' in m + else 0.0 + ) + + # capture the top-level error if present, per instance + df['error_message'] = df['error'] + logger.info(f'Total cost: USD {df["cost"].sum():.2f}') report_file = os.path.join(metadata.eval_output_dir, 'report.md') with open(report_file, 'w') as f: f.write( - f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n' + f'Success rate: {df["success"].mean():.2%}' + f' ({df["success"].sum()}/{len(df)})\n' ) f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n') f.write( - df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False) + df[ + ['instance_id', 'success', 'reason', 'cost', 'error_message'] + ].to_markdown(index=False) ) diff --git a/evaluation/integration_tests/scripts/run_infer.sh b/evaluation/integration_tests/scripts/run_infer.sh index 3ca1529359db..e5ae35e849d2 100755 --- a/evaluation/integration_tests/scripts/run_infer.sh +++ b/evaluation/integration_tests/scripts/run_infer.sh @@ -43,7 +43,7 @@ fi COMMAND="poetry run python evaluation/integration_tests/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ - --max-iterations 10 \ + --max-iterations ${MAX_ITERATIONS:-10} \ --eval-num-workers $NUM_WORKERS \ --eval-note $EVAL_NOTE"