cover more ground with tests
ayulockin committed Nov 21, 2024
1 parent ce01ec7 commit d751950
Showing 1 changed file with 99 additions and 0 deletions.
tests/scorers/test_robustness_scorer.py
@@ -56,6 +56,57 @@ def test_robustness_scorer_insufficient_outputs():
        robustness_scorer.score(output=output)


def test_robustness_scorer_with_boolean_output():
    output = [True, True, False, True]  # Boolean outputs from the system
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output)
    assert truncate(result["cohen_h"], 5) == 0.39182
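

# Illustrative sketch, not part of this commit: the asserted values are consistent
# with Cohen's h normalized by pi, where p1 is the accuracy of the original (first)
# output and p2 the accuracy of the perturbed (remaining) outputs. For
# [True, True, False, True], p1 = 1 and p2 = 2/3, and
# |2*asin(sqrt(1)) - 2*asin(sqrt(2/3))| / pi ~= 0.39182. The helper name below is
# hypothetical.
def _normalized_cohen_h_sketch(p1: float, p2: float) -> float:
    import math

    return abs(2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))) / math.pi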


def test_robustness_scorer_with_boolean_output_and_ground_truths():
    output = [True, True, False, True]  # Boolean outputs from the system
    ground_truths = [True, True, True, True]  # Boolean ground truths
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.39182


def test_robustness_scorer_with_ground_truths_as_strings():
    output = ["apple", "aple", "orange", "apple"]
    ground_truths = ["apple", "apple", "apple", "apple"]
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.60817
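    # Illustrative arithmetic under the same normalized Cohen's h assumption as
    # the sketch above: the original "apple" matches its ground truth (p1 = 1)
    # while 1 of 3 perturbed outputs does (p2 = 1/3), so
    # |2*asin(sqrt(1)) - 2*asin(sqrt(1/3))| / pi ~= 0.60817.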


def test_robustness_scorer_with_ground_truths_as_booleans():
    output = ["True", "True", "False", "True"]
    ground_truths = [False, False, False, False]  # Booleans will be converted to strings
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.39182
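    # Illustrative arithmetic: here the original "True" misses the ground truth
    # (p1 = 0) and 1 of 3 perturbed outputs matches (p2 = 1/3), so
    # |2*asin(sqrt(0)) - 2*asin(sqrt(1/3))| / pi ~= 0.39182, the same value as in
    # the boolean tests above, since asin(sqrt(p)) + asin(sqrt(1 - p)) = pi/2.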


def test_robustness_scorer_ground_truths_length_mismatch():
    output = ["True", "False", "True"]
    ground_truths = ["True", "True"]  # Mismatched length
    robustness_scorer = RobustnessScorer()
    with pytest.raises(
        AssertionError, match="Length of ground_truths must match the length of output."
    ):
        robustness_scorer.score(output=output, ground_truths=ground_truths)


def test_robustness_scorer_ground_truths_edge_case():
    output = ["True"]
    ground_truths = ["True"]
    robustness_scorer = RobustnessScorer()
    with pytest.raises(
        AssertionError, match="There must be output of at least one perturbed question."
    ):
        robustness_scorer.score(output=output, ground_truths=ground_truths)
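

# Illustrative sketch of the input guards the two tests above exercise; this is
# an assumption about RobustnessScorer.score, not its actual implementation, and
# the helper name is hypothetical.
def _validate_inputs_sketch(output, ground_truths=None):
    assert len(output) >= 2, "There must be output of at least one perturbed question."
    if ground_truths is not None:
        assert len(ground_truths) == len(output), (
            "Length of ground_truths must match the length of output."
        )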


@pytest.mark.asyncio
async def test_robustness_scorer_eval():
    dataset = [
@@ -88,3 +139,51 @@ def model(questions: list[str]):
    )
    result = await evaluation.evaluate(model)
    assert truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5) == 0.49999


@pytest.mark.asyncio
async def test_robustness_scorer_eval_with_ground_truths():
    # Simulated dataset with questions and corresponding ground truths
    dataset = [
        {
            "questions": [
                "What is the capital of France?",
                "what is the capital of france?",
                "What is the Capital of france?",
            ],
            "ground_truths": ["Paris", "Paris", "Paris"],  # Ground truths as strings
        },
        {
            "questions": [
                "Who is the owner of X.com?",
                "who owns x.com?",
                "Who is the current owner of X.com?",
            ],
            "ground_truths": ["Elon Musk", "Elon Musk", "Elon Musk"],  # Ground truths as strings
        },
    ]

    # Simulated LLM model output, matching the dataset structure
    @weave.op
    def model(questions: list[str]):
        outputs = [
            "Paris",
            "Paris",
            "Lyon",
        ]
        return outputs

    # Instantiate the RobustnessScorer
    robustness_scorer = RobustnessScorer()

    # Perform evaluation using Weave's Evaluation framework
    evaluation = weave.Evaluation(
        dataset=dataset,
        scorers=[robustness_scorer],
    )
    result = await evaluation.evaluate(model)

    # Check that Cohen's h is computed as expected
    assert "RobustnessScorer" in result, "Scorer results are missing."
    cohen_h_mean = truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5)
    assert cohen_h_mean == 0.24999, f"Unexpected Cohen's h mean: {cohen_h_mean}"
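
    # Illustrative arithmetic under the normalized Cohen's h assumption sketched
    # earlier: row 1 has p1 = 1 ("Paris" is correct) and p2 = 1/2, so h = 0.5;
    # row 2 has p1 = 0 and p2 = 0, so h = 0. The mean is 0.25, which truncates to
    # 0.24999 once floating-point rounding is involved.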
