diff --git a/tests/scorers/test_robustness_scorer.py b/tests/scorers/test_robustness_scorer.py
index 2d718d8cbb1..e51f483e38e 100644
--- a/tests/scorers/test_robustness_scorer.py
+++ b/tests/scorers/test_robustness_scorer.py
@@ -87,4 +87,4 @@ def model(questions: list[str]):
         scorers=[robustness_scorer],
     )
     result = await evaluation.evaluate(model)
-    assert result["RobustnessScorer"]["cohen_h"]["mean"] == 1.0
+    assert truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5) == 0.49999
diff --git a/weave/scorers/robustness_scorer.py b/weave/scorers/robustness_scorer.py
index 4683457171c..e65e710212d 100644
--- a/weave/scorers/robustness_scorer.py
+++ b/weave/scorers/robustness_scorer.py
@@ -69,7 +69,10 @@ def score(
         score_o = 1.0 if not ground_truths else binary_scores[0]

         # Average perturbed similarity score
-        score_p = sum(binary_scores[1:]) / len(binary_scores[1:])
+        # When ground truths are present, the original output sits at index 0,
+        # so perturbed outputs start at index 1.
+        # When no ground truths are given, binary_scores already contains only
+        # perturbed outputs, so use all of them.
+        perturbed_binary_scores = binary_scores[1:] if ground_truths else binary_scores
+        score_p = sum(perturbed_binary_scores) / len(perturbed_binary_scores)

         def psi(score: float) -> float:
             return 2 * math.asin(math.sqrt(score))
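For context on the updated test expectation, here is a minimal standalone sketch of the Cohen's h arithmetic. The psi transform is copied from the diff; the normalization by pi and the truncate helper are assumptions inferred from the 0.49999 assertion, not confirmed by the patch itself:

import math

def psi(score: float) -> float:
    # Arcsine (variance-stabilizing) transform used in Cohen's h.
    return 2 * math.asin(math.sqrt(score))

def truncate(value: float, digits: int) -> float:
    # Hypothetical stand-in for the test helper: drop digits past
    # `digits` decimal places without rounding.
    factor = 10 ** digits
    return math.trunc(value * factor) / factor

# Hypothetical values mirroring the updated test: the original output is
# correct (score_o = 1.0) and half of the perturbed outputs still match
# (score_p = 0.5).
score_o = 1.0
score_p = 0.5

# Assumption: the scorer normalizes |psi(o) - psi(p)| by pi so h lies in [0, 1].
# psi(1.0) == pi and psi(0.5) == pi/2, so h should be 0.5, but floating-point
# rounding leaves it a hair below, which is why the test truncates to 0.49999
# instead of comparing against 0.5 exactly.
cohen_h = abs(psi(score_o) - psi(score_p)) / math.pi
print(truncate(cohen_h, 5))  # 0.49999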