feat(weave): fixes tests, summarization scorer re-write, re-names flow/scorer dir, create weave/scorers dir
morganmcg1 committed Oct 12, 2024
1 parent 2114c4f commit 0b2bbf2
Showing 26 changed files with 505 additions and 178 deletions.
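The user-facing change across the docs and tests below is a single import-path move: scorers now live in a top-level `weave.scorers` package instead of `weave.flow.scorer`. A minimal before/after sketch of the migration, using a class name that appears in the diffs (other scorers follow the same pattern):

```python
# Before this commit (old path, removed by the rename):
# from weave.flow.scorer import MultiTaskBinaryClassificationF1

# After this commit (new public entry point):
from weave.scorers import MultiTaskBinaryClassificationF1
```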
2 changes: 1 addition & 1 deletion docs/docs/guides/integrations/langchain.md
@@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav

 ```python

-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1

 sentences = [
     "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
4 changes: 2 additions & 2 deletions docs/docs/tutorial-eval.md
@@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used

 ```python
 import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1

 weave.init('intro-example')

@@ -132,7 +132,7 @@ import asyncio
 # highlight-next-line
 import weave
 # highlight-next-line
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
 import openai

 # We create a model class with one predict function.
2 changes: 1 addition & 1 deletion docs/docs/tutorial-rag.md
@@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple:


 ```python
-from weave.flow.scorer import Scorer
+from weave.scorers import Scorer
 from weave import WeaveList

 class CorrectnessLLMJudge(Scorer):
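The truncated hunk above starts a custom judge by subclassing `Scorer`. For orientation, a minimal sketch of a custom scorer under the new import path; the class name, `score` signature, and returned key here are illustrative assumptions, not the tutorial's actual `CorrectnessLLMJudge` code:

```python
from weave.scorers import Scorer


class ExactMatchJudge(Scorer):  # hypothetical example, not from the tutorial
    def score(self, output: str, target: str) -> dict:
        # Scorers return a dict of named metrics, mirroring the
        # built-in scorers exercised in the tests below.
        return {"correct": output.strip().lower() == target.strip().lower()}
```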
42 changes: 34 additions & 8 deletions tests/scorers/test_hallucination_scorer.py
@@ -1,37 +1,63 @@
 import pytest
 from openai import OpenAI

-from weave.flow.scorer.hallucination_scorer import (
+from weave.flow.scorers.hallucination_scorer import (
+    HallucinationReasoning,
     HallucinationResponse,
 )
+from weave.scorers import (
+    HallucinationScorer,
+)


 # mock the create function
 @pytest.fixture
 def mock_create(monkeypatch):
     def _mock_create(*args, **kwargs):
         return HallucinationResponse(
             chain_of_thought="The output is consistent with the input data.",
-            is_hallucination=False
+            hallucination_reasonings=[
+                HallucinationReasoning(
+                    observation="My observation for this is that the output is consistent with the input data.",
+                    hallucination_type="No Hallucination",
+                )
+            ],
+            conclusion="The output is consistent with the input data.",
+            is_hallucination=False,
         )
-    monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create)
+
+    monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create)


 @pytest.fixture
 def hallucination_scorer(mock_create):
-    return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096)
+    return HallucinationScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=4096,
+    )


 def test_hallucination_scorer_initialization(hallucination_scorer):
     assert isinstance(hallucination_scorer, HallucinationScorer)
     assert hallucination_scorer.model_id == "gpt-4o"
     assert hallucination_scorer.temperature == 0.7
     assert hallucination_scorer.max_tokens == 4096


 def test_hallucination_scorer_score(hallucination_scorer, mock_create):
     output = "John's favorite cheese is cheddar."
     context = "John likes various types of cheese."
-    result = hallucination_scorer.score(output, context)
+    result = hallucination_scorer.score(output=output, context=context)
     assert isinstance(result, HallucinationResponse)
     assert not result.is_hallucination
-    assert "The output is consistent with the input data." == result.chain_of_thought
-
-    # Add more tests as needed
+    assert isinstance(result.hallucination_reasonings, list)
+    assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning)
+    assert result.chain_of_thought == "The output is consistent with the input data."
+    assert (
+        result.hallucination_reasonings[0].observation
+        == "My observation for this is that the output is consistent with the input data."
+    )
+    assert result.conclusion == "The output is consistent with the input data."
+    assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination"
28 changes: 17 additions & 11 deletions tests/scorers/test_json_scorer.py
@@ -1,44 +1,50 @@
-from weave.flow.scorer.json_scorer import JSONScorer
+from weave.scorers import ValidJSONScorer


 def test_json_scorer_valid_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"city": "San Francisco", "country": "USA"}'
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_invalid_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"city": "San Francisco", "country": "USA"'
     result = scorer.score(output)
     assert result["json_valid"] is False


 def test_json_scorer_non_json_string():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = "Just a plain string."
     result = scorer.score(output)
     assert result["json_valid"] is False


 def test_json_scorer_valid_json_list():
-    scorer = JSONScorer()
-    output = '[1, 2, 3, 4, 5]'
+    scorer = ValidJSONScorer()
+    output = "[1, 2, 3, 4, 5]"
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_nested_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"person": {"name": "John", "age": 30}, "city": "New York"}'
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_empty_object():
-    scorer = JSONScorer()
-    output = '{}'
+    scorer = ValidJSONScorer()
+    output = "{}"
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_empty_list():
-    scorer = JSONScorer()
-    output = '[]'
+    scorer = ValidJSONScorer()
+    output = "[]"
     result = scorer.score(output)
     assert result["json_valid"] is True
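These tests pin down the scorer's contract: `score` returns `{"json_valid": bool}`, and any parseable JSON document counts, including lists, `{}`, and `[]`. A self-contained sketch of that contract, assuming a plain `json.loads` check (the real `ValidJSONScorer` may differ in details):

```python
import json


def is_valid_json(output: str) -> dict:
    # Mirrors the behavior the tests assert: any parseable JSON
    # document is valid, including lists and empty containers.
    try:
        json.loads(output)
        return {"json_valid": True}
    except json.JSONDecodeError:
        return {"json_valid": False}


assert is_valid_json("[1, 2, 3, 4, 5]")["json_valid"] is True
assert is_valid_json('{"city": "San Francisco", "country": "USA"')["json_valid"] is False
```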
13 changes: 11 additions & 2 deletions tests/scorers/test_pydantic_scorer.py
@@ -1,46 +1,55 @@
 import pytest
 from pydantic import BaseModel

-from weave.flow.scorer.pydantic_scorer import PydanticScorer
+from weave.scorers import PydanticScorer


 class User(BaseModel):
     name: str
     age: int


 @pytest.fixture
 def user_scorer():
     return PydanticScorer(model=User)


 def test_pydantic_scorer_initialization():
     scorer = PydanticScorer(model=User)
     assert isinstance(scorer, PydanticScorer)
     assert scorer.model == User


 def test_pydantic_scorer_valid_json_string(user_scorer):
     valid_json = '{"name": "John", "age": 30}'
     assert user_scorer.score(valid_json) == {"valid_pydantic": True}


 def test_pydantic_scorer_valid_dict(user_scorer):
     valid_dict = {"name": "John", "age": 30}
     assert user_scorer.score(valid_dict) == {"valid_pydantic": True}


 def test_pydantic_scorer_invalid_json_string(user_scorer):
     invalid_json = '{"name": "John", "age": "thirty"}'
     assert user_scorer.score(invalid_json) == {"valid_pydantic": False}


 def test_pydantic_scorer_invalid_dict(user_scorer):
     invalid_dict = {"name": "John", "age": "thirty"}
     assert user_scorer.score(invalid_dict) == {"valid_pydantic": False}


 def test_pydantic_scorer_missing_field(user_scorer):
     missing_field = '{"name": "John"}'
     assert user_scorer.score(missing_field) == {"valid_pydantic": False}


 def test_pydantic_scorer_extra_field(user_scorer):
     extra_field = '{"name": "John", "age": 30, "city": "New York"}'
     assert user_scorer.score(extra_field) == {"valid_pydantic": True}


 def test_pydantic_scorer_invalid_input_type(user_scorer):
     invalid_input = 123  # Neither a string nor a dict
-    assert user_scorer.score(invalid_input) == {"valid_pydantic": False}
+    assert user_scorer.score(invalid_input) == {"valid_pydantic": False}
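Note the extra-field case: it passes because Pydantic ignores unknown fields by default (the model would need `extra="forbid"` in its config for the extra `city` key to fail validation). A sketch of the check the scorer presumably performs, assuming it routes strings through `model_validate_json` and everything else through `model_validate` (an assumption, not the scorer's confirmed internals):

```python
from pydantic import BaseModel, ValidationError


class User(BaseModel):
    name: str
    age: int


def pydantic_validates(model: type[BaseModel], value) -> dict:
    # Hypothetical re-creation of the scorer's behavior.
    try:
        if isinstance(value, str):
            model.model_validate_json(value)
        else:
            model.model_validate(value)
        return {"valid_pydantic": True}
    except ValidationError:
        return {"valid_pydantic": False}


assert pydantic_validates(User, '{"name": "John", "age": 30, "city": "New York"}') == {"valid_pydantic": True}
assert pydantic_validates(User, 123) == {"valid_pydantic": False}
```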
36 changes: 28 additions & 8 deletions tests/scorers/test_ragas_scorer.py
@@ -1,42 +1,60 @@
 import pytest
 from openai import OpenAI

-from weave.flow.scorer.ragas_scorer import (
-    ContextEntityRecallScorer,
-    ContextRelevancyScorer,
+from weave.flow.scorers.ragas_scorer import (
     EntityExtractionResponse,
     RelevancyResponse,
 )
+from weave.scorers import (
+    ContextEntityRecallScorer,
+    ContextRelevancyScorer,
+)


 # Mock the create function
 @pytest.fixture
 def mock_create(monkeypatch):
     def _mock_create(*args, **kwargs):
         # Retrieve the response_model to return appropriate mock responses
-        response_model = kwargs.get('response_model')
+        response_model = kwargs.get("response_model")
         if response_model == EntityExtractionResponse:
             return EntityExtractionResponse(entities=["Paris"])
         elif response_model == RelevancyResponse:
             return RelevancyResponse(
                 reasoning="The context directly answers the question.",
-                relevancy_score=1
+                relevancy_score=1,
             )
         else:
             return None
-    monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create)
+
+    monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create)


 @pytest.fixture
 def context_entity_recall_scorer(mock_create):
-    return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024)
+    return ContextEntityRecallScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=1024,
+    )


 @pytest.fixture
 def context_relevancy_scorer(mock_create):
-    return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024)
+    return ContextRelevancyScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=1024,
+    )


 def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer):
     assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer)
     assert context_entity_recall_scorer.model_id == "gpt-4o"


 def test_context_entity_recall_scorer_score(context_entity_recall_scorer):
     output = "Paris is the capital of France."
     context = "The capital city of France is Paris."
@@ -45,10 +63,12 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer):
     assert "recall" in result
     assert result["recall"] == 1.0  # Assuming full recall in mock response


 def test_context_relevancy_scorer_initialization(context_relevancy_scorer):
     assert isinstance(context_relevancy_scorer, ContextRelevancyScorer)
     assert context_relevancy_scorer.model_id == "gpt-4o"


 def test_context_relevancy_scorer_score(context_relevancy_scorer):
     output = "What is the capital of France?"
     context = "Paris is the capital city of France."
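The mocked `create` always extracts `["Paris"]`, so the test expects full recall. A sketch of the entity-recall arithmetic this implies, namely the fraction of expected entities recovered from the context; this is an inference from the mock and the `recall == 1.0` assertion, not the scorer's confirmed formula:

```python
def entity_recall(expected_entities: list[str], context_entities: list[str]) -> float:
    # Fraction of expected entities that also appear in the context.
    if not expected_entities:
        return 0.0
    found = sum(1 for entity in expected_entities if entity in context_entities)
    return found / len(expected_entities)


# With the mock returning ["Paris"] for both extraction calls:
assert entity_recall(["Paris"], ["Paris"]) == 1.0
```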
9 changes: 7 additions & 2 deletions tests/scorers/test_string_scorer.py
@@ -1,4 +1,4 @@
-from weave.flow.scorer.string_scorer import (
+from weave.scorers import (
     LevenshteinScorer,
     StringMatchScorer,
 )
@@ -11,13 +11,15 @@ def test_string_match_scorer():
     result = scorer.score(output, target)
     assert result["string_in_input"] is True


 def test_string_match_scorer_false():
     scorer = StringMatchScorer()
     output = "Alice"
     target = "Hello my name is Bob"
     result = scorer.score(output, target)
     assert result["string_in_input"] is False


 # def test_regex_scorer():
 # scorer = RegexScorer(patterns="engineer")
 # output = "I am an engineer"
@@ -36,23 +38,26 @@ def test_string_match_scorer_false():
 # result = scorer.score(output)
 # assert result["string_match"] is False


 def test_levenshtein_scorer():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "Hallo"
     result = scorer.score(output, target)
     assert result["levenshtein_distance"] == 1


 def test_levenshtein_scorer_same_strings():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "Hello"
     result = scorer.score(output, target)
     assert result["levenshtein_distance"] == 0


 def test_levenshtein_scorer_completely_different():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "World"
     result = scorer.score(output, target)
-    assert result["levenshtein_distance"] == 4
+    assert result["levenshtein_distance"] == 4
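The expected distances (`"Hello"` to `"Hallo"` is 1 substitution; `"Hello"` to `"World"` is 4) follow from the standard unit-cost edit-distance recurrence. A minimal sketch for reference; the actual `LevenshteinScorer` may delegate to a library:

```python
def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance with unit costs.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(
                prev[j] + 1,               # deletion
                curr[j - 1] + 1,           # insertion
                prev[j - 1] + (ca != cb),  # substitution
            ))
        prev = curr
    return prev[-1]


assert levenshtein("Hello", "Hallo") == 1
assert levenshtein("Hello", "World") == 4
```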