feat(weave): fixes tests, summarization scorer re-write, re-names flow/scorer dir, create weave/scorers dir
morganmcg1 committed Oct 12, 2024
1 parent 2114c4f commit 0b2bbf2
Showing 26 changed files with 505 additions and 178 deletions.
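The user-facing change across the docs and tests below is a single import-path move: scorers now live in a top-level `weave.scorers` package instead of `weave.flow.scorer`. A minimal before/after sketch of the migration, using a class name that appears in the diffs (other scorers follow the same pattern):

```python
# Before this commit (old path, removed by the rename):
# from weave.flow.scorer import MultiTaskBinaryClassificationF1

# After this commit (new public entry point):
from weave.scorers import MultiTaskBinaryClassificationF1
```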
2 changes: 1 addition & 1 deletion docs/docs/guides/integrations/langchain.md
@@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav

 ```python

-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1

 sentences = [
     "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
4 changes: 2 additions & 2 deletions docs/docs/tutorial-eval.md
@@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used

 ```python
 import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1

 weave.init('intro-example')

@@ -132,7 +132,7 @@ import asyncio
 # highlight-next-line
 import weave
 # highlight-next-line
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
 import openai

 # We create a model class with one predict function.
2 changes: 1 addition & 1 deletion docs/docs/tutorial-rag.md
@@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple:


 ```python
-from weave.flow.scorer import Scorer
+from weave.scorers import Scorer
 from weave import WeaveList

 class CorrectnessLLMJudge(Scorer):
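The truncated hunk above starts a custom judge by subclassing `Scorer`. For orientation, a minimal sketch of a custom scorer under the new import path; the class name, `score` signature, and returned key here are illustrative assumptions, not the tutorial's actual `CorrectnessLLMJudge` code:

```python
from weave.scorers import Scorer


class ExactMatchJudge(Scorer):  # hypothetical example, not from the tutorial
    def score(self, output: str, target: str) -> dict:
        # Scorers return a dict of named metrics, mirroring the
        # built-in scorers exercised in the tests below.
        return {"correct": output.strip().lower() == target.strip().lower()}
```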
42 changes: 34 additions & 8 deletions tests/scorers/test_hallucination_scorer.py
@@ -1,37 +1,63 @@
 import pytest
 from openai import OpenAI

-from weave.flow.scorer.hallucination_scorer import (
+from weave.flow.scorers.hallucination_scorer import (
+    HallucinationReasoning,
     HallucinationResponse,
 )
+from weave.scorers import (
+    HallucinationScorer,
+)


 # mock the create function
 @pytest.fixture
 def mock_create(monkeypatch):
     def _mock_create(*args, **kwargs):
         return HallucinationResponse(
             chain_of_thought="The output is consistent with the input data.",
-            is_hallucination=False
+            hallucination_reasonings=[
+                HallucinationReasoning(
+                    observation="My observation for this is that the output is consistent with the input data.",
+                    hallucination_type="No Hallucination",
+                )
+            ],
+            conclusion="The output is consistent with the input data.",
+            is_hallucination=False,
         )
-    monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create)
+
+    monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create)


 @pytest.fixture
 def hallucination_scorer(mock_create):
-    return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096)
+    return HallucinationScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=4096,
+    )


 def test_hallucination_scorer_initialization(hallucination_scorer):
     assert isinstance(hallucination_scorer, HallucinationScorer)
     assert hallucination_scorer.model_id == "gpt-4o"
     assert hallucination_scorer.temperature == 0.7
     assert hallucination_scorer.max_tokens == 4096


 def test_hallucination_scorer_score(hallucination_scorer, mock_create):
     output = "John's favorite cheese is cheddar."
     context = "John likes various types of cheese."
-    result = hallucination_scorer.score(output, context)
+    result = hallucination_scorer.score(output=output, context=context)
     assert isinstance(result, HallucinationResponse)
     assert not result.is_hallucination
-    assert "The output is consistent with the input data." == result.chain_of_thought
-
-    # Add more tests as needed
+    assert isinstance(result.hallucination_reasonings, list)
+    assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning)
+    assert result.chain_of_thought == "The output is consistent with the input data."
+    assert (
+        result.hallucination_reasonings[0].observation
+        == "My observation for this is that the output is consistent with the input data."
+    )
+    assert result.conclusion == "The output is consistent with the input data."
+    assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination"
28 changes: 17 additions & 11 deletions tests/scorers/test_json_scorer.py
@@ -1,44 +1,50 @@
-from weave.flow.scorer.json_scorer import JSONScorer
+from weave.scorers import ValidJSONScorer


 def test_json_scorer_valid_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"city": "San Francisco", "country": "USA"}'
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_invalid_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"city": "San Francisco", "country": "USA"'
     result = scorer.score(output)
     assert result["json_valid"] is False


 def test_json_scorer_non_json_string():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = "Just a plain string."
     result = scorer.score(output)
     assert result["json_valid"] is False


 def test_json_scorer_valid_json_list():
-    scorer = JSONScorer()
-    output = '[1, 2, 3, 4, 5]'
+    scorer = ValidJSONScorer()
+    output = "[1, 2, 3, 4, 5]"
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_nested_json():
-    scorer = JSONScorer()
+    scorer = ValidJSONScorer()
     output = '{"person": {"name": "John", "age": 30}, "city": "New York"}'
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_empty_object():
-    scorer = JSONScorer()
-    output = '{}'
+    scorer = ValidJSONScorer()
+    output = "{}"
     result = scorer.score(output)
     assert result["json_valid"] is True


 def test_json_scorer_empty_list():
-    scorer = JSONScorer()
-    output = '[]'
+    scorer = ValidJSONScorer()
+    output = "[]"
     result = scorer.score(output)
     assert result["json_valid"] is True
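These tests pin down the scorer's contract: `score` returns `{"json_valid": bool}`, and any parseable JSON document counts, including lists, `{}`, and `[]`. A self-contained sketch of that contract, assuming a plain `json.loads` check (the real `ValidJSONScorer` may differ in details):

```python
import json


def is_valid_json(output: str) -> dict:
    # Mirrors the behavior the tests assert: any parseable JSON
    # document is valid, including lists and empty containers.
    try:
        json.loads(output)
        return {"json_valid": True}
    except json.JSONDecodeError:
        return {"json_valid": False}


assert is_valid_json("[1, 2, 3, 4, 5]")["json_valid"] is True
assert is_valid_json('{"city": "San Francisco", "country": "USA"')["json_valid"] is False
```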
13 changes: 11 additions & 2 deletions tests/scorers/test_pydantic_scorer.py
@@ -1,46 +1,55 @@
 import pytest
 from pydantic import BaseModel

-from weave.flow.scorer.pydantic_scorer import PydanticScorer
+from weave.scorers import PydanticScorer


 class User(BaseModel):
     name: str
     age: int


 @pytest.fixture
 def user_scorer():
     return PydanticScorer(model=User)


 def test_pydantic_scorer_initialization():
     scorer = PydanticScorer(model=User)
     assert isinstance(scorer, PydanticScorer)
     assert scorer.model == User


 def test_pydantic_scorer_valid_json_string(user_scorer):
     valid_json = '{"name": "John", "age": 30}'
     assert user_scorer.score(valid_json) == {"valid_pydantic": True}


 def test_pydantic_scorer_valid_dict(user_scorer):
     valid_dict = {"name": "John", "age": 30}
     assert user_scorer.score(valid_dict) == {"valid_pydantic": True}


 def test_pydantic_scorer_invalid_json_string(user_scorer):
     invalid_json = '{"name": "John", "age": "thirty"}'
     assert user_scorer.score(invalid_json) == {"valid_pydantic": False}


 def test_pydantic_scorer_invalid_dict(user_scorer):
     invalid_dict = {"name": "John", "age": "thirty"}
     assert user_scorer.score(invalid_dict) == {"valid_pydantic": False}


 def test_pydantic_scorer_missing_field(user_scorer):
     missing_field = '{"name": "John"}'
     assert user_scorer.score(missing_field) == {"valid_pydantic": False}


 def test_pydantic_scorer_extra_field(user_scorer):
     extra_field = '{"name": "John", "age": 30, "city": "New York"}'
     assert user_scorer.score(extra_field) == {"valid_pydantic": True}


 def test_pydantic_scorer_invalid_input_type(user_scorer):
     invalid_input = 123  # Neither a string nor a dict
-    assert user_scorer.score(invalid_input) == {"valid_pydantic": False}
+    assert user_scorer.score(invalid_input) == {"valid_pydantic": False}
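Note the extra-field case: it passes because Pydantic ignores unknown fields by default (the model would need `extra="forbid"` in its config for the extra `city` key to fail validation). A sketch of the check the scorer presumably performs, assuming it routes strings through `model_validate_json` and everything else through `model_validate` (an assumption, not the scorer's confirmed internals):

```python
from pydantic import BaseModel, ValidationError


class User(BaseModel):
    name: str
    age: int


def pydantic_validates(model: type[BaseModel], value) -> dict:
    # Hypothetical re-creation of the scorer's behavior.
    try:
        if isinstance(value, str):
            model.model_validate_json(value)
        else:
            model.model_validate(value)
        return {"valid_pydantic": True}
    except ValidationError:
        return {"valid_pydantic": False}


assert pydantic_validates(User, '{"name": "John", "age": 30, "city": "New York"}') == {"valid_pydantic": True}
assert pydantic_validates(User, 123) == {"valid_pydantic": False}
```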
36 changes: 28 additions & 8 deletions tests/scorers/test_ragas_scorer.py
@@ -1,42 +1,60 @@
 import pytest
 from openai import OpenAI

-from weave.flow.scorer.ragas_scorer import (
-    ContextEntityRecallScorer,
-    ContextRelevancyScorer,
+from weave.flow.scorers.ragas_scorer import (
     EntityExtractionResponse,
     RelevancyResponse,
 )
+from weave.scorers import (
+    ContextEntityRecallScorer,
+    ContextRelevancyScorer,
+)


 # Mock the create function
 @pytest.fixture
 def mock_create(monkeypatch):
     def _mock_create(*args, **kwargs):
         # Retrieve the response_model to return appropriate mock responses
-        response_model = kwargs.get('response_model')
+        response_model = kwargs.get("response_model")
         if response_model == EntityExtractionResponse:
             return EntityExtractionResponse(entities=["Paris"])
         elif response_model == RelevancyResponse:
             return RelevancyResponse(
                 reasoning="The context directly answers the question.",
-                relevancy_score=1
+                relevancy_score=1,
             )
         else:
             return None
-    monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create)
+
+    monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create)


 @pytest.fixture
 def context_entity_recall_scorer(mock_create):
-    return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024)
+    return ContextEntityRecallScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=1024,
+    )


 @pytest.fixture
 def context_relevancy_scorer(mock_create):
-    return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024)
+    return ContextRelevancyScorer(
+        client=OpenAI(api_key="DUMMY_API_KEY"),
+        model_id="gpt-4o",
+        temperature=0.7,
+        max_tokens=1024,
+    )


 def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer):
     assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer)
     assert context_entity_recall_scorer.model_id == "gpt-4o"


 def test_context_entity_recall_scorer_score(context_entity_recall_scorer):
     output = "Paris is the capital of France."
     context = "The capital city of France is Paris."
@@ -45,10 +63,12 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer):
     assert "recall" in result
     assert result["recall"] == 1.0  # Assuming full recall in mock response


 def test_context_relevancy_scorer_initialization(context_relevancy_scorer):
     assert isinstance(context_relevancy_scorer, ContextRelevancyScorer)
     assert context_relevancy_scorer.model_id == "gpt-4o"


 def test_context_relevancy_scorer_score(context_relevancy_scorer):
     output = "What is the capital of France?"
     context = "Paris is the capital city of France."
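The mocked `create` always extracts `["Paris"]`, so the test expects full recall. A sketch of the entity-recall arithmetic this implies, namely the fraction of expected entities recovered from the context; this is an inference from the mock and the `recall == 1.0` assertion, not the scorer's confirmed formula:

```python
def entity_recall(expected_entities: list[str], context_entities: list[str]) -> float:
    # Fraction of expected entities that also appear in the context.
    if not expected_entities:
        return 0.0
    found = sum(1 for entity in expected_entities if entity in context_entities)
    return found / len(expected_entities)


# With the mock returning ["Paris"] for both extraction calls:
assert entity_recall(["Paris"], ["Paris"]) == 1.0
```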
9 changes: 7 additions & 2 deletions tests/scorers/test_string_scorer.py
@@ -1,4 +1,4 @@
-from weave.flow.scorer.string_scorer import (
+from weave.scorers import (
     LevenshteinScorer,
     StringMatchScorer,
 )
@@ -11,13 +11,15 @@ def test_string_match_scorer():
     result = scorer.score(output, target)
     assert result["string_in_input"] is True


 def test_string_match_scorer_false():
     scorer = StringMatchScorer()
     output = "Alice"
     target = "Hello my name is Bob"
     result = scorer.score(output, target)
     assert result["string_in_input"] is False


 # def test_regex_scorer():
 # scorer = RegexScorer(patterns="engineer")
 # output = "I am an engineer"
@@ -36,23 +38,26 @@ def test_string_match_scorer_false():
 # result = scorer.score(output)
 # assert result["string_match"] is False


 def test_levenshtein_scorer():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "Hallo"
     result = scorer.score(output, target)
     assert result["levenshtein_distance"] == 1


 def test_levenshtein_scorer_same_strings():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "Hello"
     result = scorer.score(output, target)
     assert result["levenshtein_distance"] == 0


 def test_levenshtein_scorer_completely_different():
     scorer = LevenshteinScorer()
     output = "Hello"
     target = "World"
     result = scorer.score(output, target)
-    assert result["levenshtein_distance"] == 4
+    assert result["levenshtein_distance"] == 4
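The expected distances (`"Hello"` to `"Hallo"` is 1 substitution; `"Hello"` to `"World"` is 4) follow from the standard unit-cost edit-distance recurrence. A minimal sketch for reference; the actual `LevenshteinScorer` may delegate to a library:

```python
def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance with unit costs.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(
                prev[j] + 1,               # deletion
                curr[j - 1] + 1,           # insertion
                prev[j - 1] + (ca != cb),  # substitution
            ))
        prev = curr
    return prev[-1]


assert levenshtein("Hello", "Hallo") == 1
assert levenshtein("Hello", "World") == 4
```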