fix scorer - slight logic fallacy
ayulockin committed Nov 21, 2024
1 parent 4e9c192 commit ce01ec7
Showing 2 changed files with 5 additions and 2 deletions.
2 changes: 1 addition & 1 deletion tests/scorers/test_robustness_scorer.py
@@ -87,4 +87,4 @@ def model(questions: list[str]):
         scorers=[robustness_scorer],
     )
     result = await evaluation.evaluate(model)
-    assert result["RobustnessScorer"]["cohen_h"]["mean"] == 1.0
+    assert truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5) == 0.49999
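The updated assertion compares against a truncated value rather than an exact float, which avoids flaky equality checks on floating-point results. The truncate helper itself is not shown in this diff; the sketch below is a hypothetical reconstruction of what such a helper presumably does (floor, not round, to a fixed number of decimal places, so a value like 0.4999997 compares equal to 0.49999) — its name matches the test, but the body is an assumption:

```python
import math

# Hypothetical sketch of the truncate helper the test relies on; the real
# definition is not part of this diff. It floors (rather than rounds) to
# `decimals` places, so e.g. 0.4999997 -> 0.49999.
def truncate(value: float, decimals: int) -> float:
    factor = 10 ** decimals
    return math.floor(value * factor) / factor
```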
5 changes: 4 additions & 1 deletion weave/scorers/robustness_scorer.py
@@ -69,7 +69,10 @@ def score(
         score_o = 1.0 if not ground_truths else binary_scores[0]
 
         # Average perturbed similarity score
-        score_p = sum(binary_scores[1:]) / len(binary_scores[1:])
+        # When ground truths are present, start from index 1
+        # When no ground truths, use all perturbed outputs (already in binary_scores)
+        perturbed_binary_scores = binary_scores[1:] if ground_truths else binary_scores
+        score_p = sum(perturbed_binary_scores) / len(perturbed_binary_scores)
 
         def psi(score: float) -> float:
             return 2 * math.asin(math.sqrt(score))
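For readers of the diff: the fix addresses an off-by-one in how the perturbed scores are selected. The following is a minimal standalone sketch of the bug and the fix, assuming the binary_scores layout implied by the comments above (the original output's score sits at index 0 only when ground truths are supplied; with no ground truths, every entry is a perturbed score):

```python
def mean_perturbed_score(binary_scores: list[float], ground_truths) -> float:
    # Buggy version always sliced from index 1, so with no ground truths the
    # first *perturbed* score was silently dropped:
    #   score_p = sum(binary_scores[1:]) / len(binary_scores[1:])
    perturbed = binary_scores[1:] if ground_truths else binary_scores
    return sum(perturbed) / len(perturbed)

# With no ground truths, all three entries are perturbed scores; the old
# slicing would have returned 0.0 here instead of 1/3.
print(mean_perturbed_score([1.0, 0.0, 0.0], ground_truths=None))  # 0.3333...
```

This shift in score_p is also why the test expectation changed: psi is the arcsine transform behind Cohen's h (h = |2·asin(√p₁) − 2·asin(√p₂)|), so changing which scores enter score_p changes the resulting cohen_h mean.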
