From ce01ec773355d4b829de473bebdd47b419115448 Mon Sep 17 00:00:00 2001 From: ayulockin Date: Thu, 21 Nov 2024 13:01:09 +0530 Subject: [PATCH] fix scorer - slight logic fallacy --- tests/scorers/test_robustness_scorer.py | 2 +- weave/scorers/robustness_scorer.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/scorers/test_robustness_scorer.py b/tests/scorers/test_robustness_scorer.py index 2d718d8cbb1..e51f483e38e 100644 --- a/tests/scorers/test_robustness_scorer.py +++ b/tests/scorers/test_robustness_scorer.py @@ -87,4 +87,4 @@ def model(questions: list[str]): scorers=[robustness_scorer], ) result = await evaluation.evaluate(model) - assert result["RobustnessScorer"]["cohen_h"]["mean"] == 1.0 + assert truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5) == 0.49999 diff --git a/weave/scorers/robustness_scorer.py b/weave/scorers/robustness_scorer.py index 4683457171c..e65e710212d 100644 --- a/weave/scorers/robustness_scorer.py +++ b/weave/scorers/robustness_scorer.py @@ -69,7 +69,10 @@ def score( score_o = 1.0 if not ground_truths else binary_scores[0] # Average perturbed similarity score - score_p = sum(binary_scores[1:]) / len(binary_scores[1:]) + # When ground truths are present, start from index 1 + # When no ground truths, use all perturbed outputs (already in binary_scores) + perturbed_binary_scores = binary_scores[1:] if ground_truths else binary_scores + score_p = sum(perturbed_binary_scores) / len(perturbed_binary_scores) def psi(score: float) -> float: return 2 * math.asin(math.sqrt(score))