cover more ground with tests
ayulockin committed Nov 21, 2024
1 parent ce01ec7 commit d751950
Showing 1 changed file with 99 additions and 0 deletions.
tests/scorers/test_robustness_scorer.py
@@ -56,6 +56,57 @@ def test_robustness_scorer_insufficient_outputs():
        robustness_scorer.score(output=output)


def test_robustness_scorer_with_boolean_output():
    output = [True, True, False, True]  # Boolean outputs from the system
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output)
    assert truncate(result["cohen_h"], 5) == 0.39182
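

# Illustrative sketch, not part of this commit: the asserted values are consistent
# with Cohen's h normalized by pi, where p1 is the accuracy of the original (first)
# output and p2 the accuracy of the perturbed (remaining) outputs. For
# [True, True, False, True], p1 = 1 and p2 = 2/3, and
# |2*asin(sqrt(1)) - 2*asin(sqrt(2/3))| / pi ~= 0.39182. The helper name below is
# hypothetical.
def _normalized_cohen_h_sketch(p1: float, p2: float) -> float:
    import math

    return abs(2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))) / math.pi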


def test_robustness_scorer_with_boolean_output_and_ground_truths():
    output = [True, True, False, True]  # Boolean outputs from the system
    ground_truths = [True, True, True, True]  # Boolean ground truths
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.39182


def test_robustness_scorer_with_ground_truths_as_strings():
    output = ["apple", "aple", "orange", "apple"]
    ground_truths = ["apple", "apple", "apple", "apple"]
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.60817
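    # Illustrative arithmetic under the same normalized Cohen's h assumption as
    # the sketch above: the original "apple" matches its ground truth (p1 = 1)
    # while 1 of 3 perturbed outputs does (p2 = 1/3), so
    # |2*asin(sqrt(1)) - 2*asin(sqrt(1/3))| / pi ~= 0.60817.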


def test_robustness_scorer_with_ground_truths_as_booleans():
    output = ["True", "True", "False", "True"]
    ground_truths = [False, False, False, False]  # Booleans will be converted to strings
    robustness_scorer = RobustnessScorer()
    result = robustness_scorer.score(output=output, ground_truths=ground_truths)
    assert truncate(result["cohen_h"], 5) == 0.39182
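    # Illustrative arithmetic: here the original "True" misses the ground truth
    # (p1 = 0) and 1 of 3 perturbed outputs matches (p2 = 1/3), so
    # |2*asin(sqrt(0)) - 2*asin(sqrt(1/3))| / pi ~= 0.39182, the same value as in
    # the boolean tests above, since asin(sqrt(p)) + asin(sqrt(1 - p)) = pi/2.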


def test_robustness_scorer_ground_truths_length_mismatch():
    output = ["True", "False", "True"]
    ground_truths = ["True", "True"]  # Mismatched length
    robustness_scorer = RobustnessScorer()
    with pytest.raises(
        AssertionError, match="Length of ground_truths must match the length of output."
    ):
        robustness_scorer.score(output=output, ground_truths=ground_truths)


def test_robustness_scorer_ground_truths_edge_case():
    output = ["True"]
    ground_truths = ["True"]
    robustness_scorer = RobustnessScorer()
    with pytest.raises(
        AssertionError, match="There must be output of at least one perturbed question."
    ):
        robustness_scorer.score(output=output, ground_truths=ground_truths)
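

# Illustrative sketch of the input guards the two tests above exercise; this is
# an assumption about RobustnessScorer.score, not its actual implementation, and
# the helper name is hypothetical.
def _validate_inputs_sketch(output, ground_truths=None):
    assert len(output) >= 2, "There must be output of at least one perturbed question."
    if ground_truths is not None:
        assert len(ground_truths) == len(output), (
            "Length of ground_truths must match the length of output."
        )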


@pytest.mark.asyncio
async def test_robustness_scorer_eval():
    dataset = [
@@ -88,3 +139,51 @@ def model(questions: list[str]):
    )
    result = await evaluation.evaluate(model)
    assert truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5) == 0.49999


@pytest.mark.asyncio
async def test_robustness_scorer_eval_with_ground_truths():
    # Simulated dataset with questions and corresponding ground truths
    dataset = [
        {
            "questions": [
                "What is the capital of France?",
                "what is the capital of france?",
                "What is the Capital of france?",
            ],
            "ground_truths": ["Paris", "Paris", "Paris"],  # Ground truths as strings
        },
        {
            "questions": [
                "Who is the owner of X.com?",
                "who owns x.com?",
                "Who is the current owner of X.com?",
            ],
            "ground_truths": ["Elon Musk", "Elon Musk", "Elon Musk"],  # Ground truths as strings
        },
    ]

    # Simulated LLM model output, matching the dataset structure
    @weave.op
    def model(questions: list[str]):
        outputs = [
            "Paris",
            "Paris",
            "Lyon",
        ]
        return outputs

    # Instantiate the RobustnessScorer
    robustness_scorer = RobustnessScorer()

    # Perform evaluation using Weave's Evaluation framework
    evaluation = weave.Evaluation(
        dataset=dataset,
        scorers=[robustness_scorer],
    )
    result = await evaluation.evaluate(model)

    # Check that Cohen's h is computed as expected
    assert "RobustnessScorer" in result, "Scorer results are missing."
    cohen_h_mean = truncate(result["RobustnessScorer"]["cohen_h"]["mean"], 5)
    assert cohen_h_mean == 0.24999, f"Unexpected Cohen's h mean: {cohen_h_mean}"
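
    # Illustrative arithmetic under the normalized Cohen's h assumption sketched
    # earlier: row 1 has p1 = 1 ("Paris" is correct) and p2 = 1/2, so h = 0.5;
    # row 2 has p1 = 0 and p2 = 0, so h = 0. The mean is 0.25, which truncates to
    # 0.24999 once floating-point rounding is involved.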
