From 0b2bbf2f05e053f653b9ff4dad73c3bd34eae34f Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 22:47:35 +0100 Subject: [PATCH] feat(weave): fixes tests, summarization scorer re-write, re-names flow/scorer dir, create weave/scorers dir --- docs/docs/guides/integrations/langchain.md | 2 +- docs/docs/tutorial-eval.md | 4 +- docs/docs/tutorial-rag.md | 2 +- tests/scorers/test_hallucination_scorer.py | 42 ++++- tests/scorers/test_json_scorer.py | 28 ++-- tests/scorers/test_pydantic_scorer.py | 13 +- tests/scorers/test_ragas_scorer.py | 36 +++- tests/scorers/test_string_scorer.py | 9 +- tests/scorers/test_summarization_scorer.py | 74 +++++--- tests/trace/test_evaluations.py | 35 ++-- weave/__init__.py | 14 -- weave/flow/eval.py | 35 ++-- weave/flow/scorers/__init__.py | 48 +++--- weave/flow/scorers/classification_scorer.py | 2 +- weave/flow/scorers/hallucination_scorer.py | 103 ++++++++++-- weave/flow/scorers/json_scorer.py | 4 +- weave/flow/scorers/llm_scorer.py | 14 +- weave/flow/scorers/llm_utils.py | 19 ++- weave/flow/scorers/moderation_scorer.py | 2 +- weave/flow/scorers/pydantic_scorer.py | 2 +- weave/flow/scorers/ragas_scorer.py | 4 +- weave/flow/scorers/similarity_score.py | 4 +- weave/flow/scorers/string_scorer.py | 5 +- weave/flow/scorers/summarization_scorer.py | 177 +++++++++++++++++--- weave/flow/scorers/xml_scorer.py | 4 +- weave/scorers/__init__.py | 1 + 26 files changed, 505 insertions(+), 178 deletions(-) create mode 100644 weave/scorers/__init__.py diff --git a/docs/docs/guides/integrations/langchain.md b/docs/docs/guides/integrations/langchain.md index b382e793e70..4487a85dfd4 100644 --- a/docs/docs/guides/integrations/langchain.md +++ b/docs/docs/guides/integrations/langchain.md @@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav ```python -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 sentences = [ "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md index 929d29ab56a..0705443b1f0 100644 --- a/docs/docs/tutorial-eval.md +++ b/docs/docs/tutorial-eval.md @@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used ```python import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 weave.init('intro-example') @@ -132,7 +132,7 @@ import asyncio # highlight-next-line import weave # highlight-next-line -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 import openai # We create a model class with one predict function. 
diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md index 81cf3b2b9b2..466d6326549 100644 --- a/docs/docs/tutorial-rag.md +++ b/docs/docs/tutorial-rag.md @@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple: ```python -from weave.flow.scorer import Scorer +from weave.scorers import Scorer from weave import WeaveList class CorrectnessLLMJudge(Scorer): diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index b486467f717..6cae7d157d8 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -1,24 +1,43 @@ import pytest from openai import OpenAI -from weave.flow.scorer.hallucination_scorer import ( +from weave.flow.scorers.hallucination_scorer import ( + HallucinationReasoning, HallucinationResponse, +) +from weave.scorers import ( HallucinationScorer, ) + # mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): return HallucinationResponse( chain_of_thought="The output is consistent with the input data.", - is_hallucination=False + hallucination_reasonings=[ + HallucinationReasoning( + observation="My observation for this is that the output is consistent with the input data.", + hallucination_type="No Hallucination", + ) + ], + conclusion="The output is consistent with the input data.", + is_hallucination=False, ) - monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) + @pytest.fixture def hallucination_scorer(mock_create): - return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096) + return HallucinationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=4096, + ) + def test_hallucination_scorer_initialization(hallucination_scorer): assert isinstance(hallucination_scorer, HallucinationScorer) @@ -26,12 +45,19 @@ def test_hallucination_scorer_initialization(hallucination_scorer): assert hallucination_scorer.temperature == 0.7 assert hallucination_scorer.max_tokens == 4096 + def test_hallucination_scorer_score(hallucination_scorer, mock_create): output = "John's favorite cheese is cheddar." context = "John likes various types of cheese." - result = hallucination_scorer.score(output, context) + result = hallucination_scorer.score(output=output, context=context) assert isinstance(result, HallucinationResponse) assert not result.is_hallucination - assert "The output is consistent with the input data." == result.chain_of_thought - -# Add more tests as needed + assert isinstance(result.hallucination_reasonings, list) + assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning) + assert result.chain_of_thought == "The output is consistent with the input data." + assert ( + result.hallucination_reasonings[0].observation + == "My observation for this is that the output is consistent with the input data." + ) + assert result.conclusion == "The output is consistent with the input data." 
+ assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination" diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py index 443e8d10872..6cd1cf480cf 100644 --- a/tests/scorers/test_json_scorer.py +++ b/tests/scorers/test_json_scorer.py @@ -1,44 +1,50 @@ -from weave.flow.scorer.json_scorer import JSONScorer +from weave.scorers import ValidJSONScorer def test_json_scorer_valid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_invalid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"' result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_non_json_string(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = "Just a plain string." result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_valid_json_list(): - scorer = JSONScorer() - output = '[1, 2, 3, 4, 5]' + scorer = ValidJSONScorer() + output = "[1, 2, 3, 4, 5]" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_nested_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"person": {"name": "John", "age": 30}, "city": "New York"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_object(): - scorer = JSONScorer() - output = '{}' + scorer = ValidJSONScorer() + output = "{}" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_list(): - scorer = JSONScorer() - output = '[]' + scorer = ValidJSONScorer() + output = "[]" result = scorer.score(output) assert result["json_valid"] is True diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index 1a4112fc3f6..f9953ba6abd 100644 --- a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -1,46 +1,55 @@ import pytest from pydantic import BaseModel -from weave.flow.scorer.pydantic_scorer import PydanticScorer +from weave.scorers import PydanticScorer class User(BaseModel): name: str age: int + @pytest.fixture def user_scorer(): return PydanticScorer(model=User) + def test_pydantic_scorer_initialization(): scorer = PydanticScorer(model=User) assert isinstance(scorer, PydanticScorer) assert scorer.model == User + def test_pydantic_scorer_valid_json_string(user_scorer): valid_json = '{"name": "John", "age": 30}' assert user_scorer.score(valid_json) == {"valid_pydantic": True} + def test_pydantic_scorer_valid_dict(user_scorer): valid_dict = {"name": "John", "age": 30} assert user_scorer.score(valid_dict) == {"valid_pydantic": True} + def test_pydantic_scorer_invalid_json_string(user_scorer): invalid_json = '{"name": "John", "age": "thirty"}' assert user_scorer.score(invalid_json) == {"valid_pydantic": False} + def test_pydantic_scorer_invalid_dict(user_scorer): invalid_dict = {"name": "John", "age": "thirty"} assert user_scorer.score(invalid_dict) == {"valid_pydantic": False} + def test_pydantic_scorer_missing_field(user_scorer): missing_field = '{"name": "John"}' assert user_scorer.score(missing_field) == {"valid_pydantic": False} + def test_pydantic_scorer_extra_field(user_scorer): extra_field = '{"name": "John", "age": 30, "city": "New York"}' assert user_scorer.score(extra_field) == {"valid_pydantic": True} + def 
test_pydantic_scorer_invalid_input_type(user_scorer): invalid_input = 123 # Neither a string nor a dict - assert user_scorer.score(invalid_input) == {"valid_pydantic": False} \ No newline at end of file + assert user_scorer.score(invalid_input) == {"valid_pydantic": False} diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 108db4f69f0..2144200d809 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,42 +1,60 @@ import pytest from openai import OpenAI -from weave.flow.scorer.ragas_scorer import ( - ContextEntityRecallScorer, - ContextRelevancyScorer, +from weave.flow.scorers.ragas_scorer import ( EntityExtractionResponse, RelevancyResponse, ) +from weave.scorers import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) + # Mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): # Retrieve the response_model to return appropriate mock responses - response_model = kwargs.get('response_model') + response_model = kwargs.get("response_model") if response_model == EntityExtractionResponse: return EntityExtractionResponse(entities=["Paris"]) elif response_model == RelevancyResponse: return RelevancyResponse( reasoning="The context directly answers the question.", - relevancy_score=1 + relevancy_score=1, ) else: return None - monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create) + @pytest.fixture def context_entity_recall_scorer(mock_create): - return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextEntityRecallScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + @pytest.fixture def context_relevancy_scorer(mock_create): - return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextRelevancyScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) assert context_entity_recall_scorer.model_id == "gpt-4o" + def test_context_entity_recall_scorer_score(context_entity_recall_scorer): output = "Paris is the capital of France." context = "The capital city of France is Paris." @@ -45,10 +63,12 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer): assert "recall" in result assert result["recall"] == 1.0 # Assuming full recall in mock response + def test_context_relevancy_scorer_initialization(context_relevancy_scorer): assert isinstance(context_relevancy_scorer, ContextRelevancyScorer) assert context_relevancy_scorer.model_id == "gpt-4o" + def test_context_relevancy_scorer_score(context_relevancy_scorer): output = "What is the capital of France?" context = "Paris is the capital city of France." 
diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index 3c460cb04db..dfa05daf7e8 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -1,4 +1,4 @@ -from weave.flow.scorer.string_scorer import ( +from weave.scorers import ( LevenshteinScorer, StringMatchScorer, ) @@ -11,6 +11,7 @@ def test_string_match_scorer(): result = scorer.score(output, target) assert result["string_in_input"] is True + def test_string_match_scorer_false(): scorer = StringMatchScorer() output = "Alice" @@ -18,6 +19,7 @@ def test_string_match_scorer_false(): result = scorer.score(output, target) assert result["string_in_input"] is False + # def test_regex_scorer(): # scorer = RegexScorer(patterns="engineer") # output = "I am an engineer" @@ -36,6 +38,7 @@ def test_string_match_scorer_false(): # result = scorer.score(output) # assert result["string_match"] is False + def test_levenshtein_scorer(): scorer = LevenshteinScorer() output = "Hello" @@ -43,6 +46,7 @@ def test_levenshtein_scorer(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 1 + def test_levenshtein_scorer_same_strings(): scorer = LevenshteinScorer() output = "Hello" @@ -50,9 +54,10 @@ def test_levenshtein_scorer_same_strings(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 0 + def test_levenshtein_scorer_completely_different(): scorer = LevenshteinScorer() output = "Hello" target = "World" result = scorer.score(output, target) - assert result["levenshtein_distance"] == 4 \ No newline at end of file + assert result["levenshtein_distance"] == 4 diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 4534056ecf8..60b026b3080 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,45 +1,81 @@ import pytest from openai import OpenAI -from weave.flow.scorer.summarization_scorer import ( +from weave.flow.scorers.summarization_scorer import ( EntityExtractionResponse, + SummarizationEvaluationResponse, +) +from weave.scorers import ( SummarizationScorer, ) -# mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): - return EntityExtractionResponse( - entities=["entity1", "entity2"] - ) - monkeypatch.setattr('weave.flow.scorer.summarization_scorer.create', _mock_create) + response_model = kwargs.get("response_model") + if response_model == EntityExtractionResponse: + return EntityExtractionResponse(entities=["entity1", "entity2"]) + elif response_model == SummarizationEvaluationResponse: + return SummarizationEvaluationResponse( + think_step_by_step="This is some reasoning.", + summarization_evaluation="excellent", + ) + else: + return None + + # Patch the 'create' function wherever it is called + monkeypatch.setattr("weave.flow.scorers.summarization_scorer.create", _mock_create) + @pytest.fixture def summarization_scorer(mock_create): - return SummarizationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return SummarizationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + -def test_summarization_scorer_initialization(summarization_scorer, mock_create): +def test_summarization_scorer_evaluate_summary(summarization_scorer, mock_create): + input_text = "This is the original text." + summary_text = "This is the summary." 
+ result = summarization_scorer.evaluate_summary( + input=input_text, summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) + assert result.summarization_evaluation == "excellent" + assert result.think_step_by_step == "This is some reasoning." + + +@pytest.mark.asyncio +async def test_summarization_scorer_score(summarization_scorer): + input_text = "This is the original text." + output_text = "This is the summary." + result = await summarization_scorer.score(input=input_text, output=output_text) + assert isinstance(result, dict) + assert "summarization_eval_score" in result + assert result["summarization_eval_score"] == 1.0 # "excellent" maps to 1.0 + assert "llm_eval_reasoning" in result + assert result["llm_eval_reasoning"] == "This is some reasoning." + assert "is_entity_dense" in result + assert isinstance(result["is_entity_dense"], bool) + assert "entity_density" in result + assert isinstance(result["entity_density"], float) + + +def test_summarization_scorer_initialization(summarization_scorer): assert isinstance(summarization_scorer, SummarizationScorer) assert summarization_scorer.model_id == "gpt-4o" assert summarization_scorer.temperature == 0.7 assert summarization_scorer.max_tokens == 1024 -def test_summarization_scorer_extract_entities(summarization_scorer, mock_create): + +def test_summarization_scorer_extract_entities(summarization_scorer): text = "This is a sample text with entities." entities = summarization_scorer.extract_entities(text) assert isinstance(entities, list) assert len(entities) == 2 assert "entity1" in entities assert "entity2" in entities - -def test_summarization_scorer_score(summarization_scorer): - input_text = "This is the original text with entities." - output_text = "This is a summary with some entities." 
- result = summarization_scorer.score(input=input_text, output=output_text) - assert isinstance(result, dict) - assert "recall" in result - assert 0 <= result["recall"] <= 1 - -# Add more tests as needed diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 49f8426cc65..16c73aed5b3 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -749,17 +749,15 @@ def function_score(image, dc, model, obj, text, output) -> bool: assert "file_content_read" in access_log - @pytest.mark.asyncio async def test_evaluation_with_column_map(): - # Define a dummy scorer that uses column_map class DummyScorer(Scorer): @weave.op() def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} - + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) @@ -781,12 +779,13 @@ def model_function(col1, col2): eval_out = await evaluation.evaluate(model_function) # Check that 'DummyScorer' is in the results - assert 'DummyScorer' in eval_out + assert "DummyScorer" in eval_out # The expected summary should show that 3 out of 4 predictions matched expected_results = {"true_count": 3, "true_fraction": 0.75} - assert eval_out['DummyScorer']["match"] == expected_results, "The summary should reflect the correct number of matches" - + assert ( + eval_out["DummyScorer"]["match"] == expected_results + ), "The summary should reflect the correct number of matches" # Define another dummy scorer @@ -799,16 +798,20 @@ class DummyScorer(Scorer): def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} + class AnotherDummyScorer(Scorer): @weave.op() def score(self, input1: str, input2: str, output: str) -> dict: # Return whether input1 == output reversed return {"match": input1 == output[::-1]} + # First scorer maps 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) # Second scorer maps 'input1'->'col2', 'input2'->'col1' - another_dummy_scorer = AnotherDummyScorer(column_map={"input1": "col2", "input2": "col1"}) + another_dummy_scorer = AnotherDummyScorer( + column_map={"input1": "col2", "input2": "col1"} + ) @weave.op() def model_function(col1, col2): @@ -821,18 +824,22 @@ def model_function(col1, col2): {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"}, ] - evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer]) + evaluation = Evaluation( + dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer] + ) # Run the evaluation eval_out = await evaluation.evaluate(model_function) # Check that both scorers are in the results - assert 'DummyScorer' in eval_out - assert 'AnotherDummyScorer' in eval_out + assert "DummyScorer" in eval_out + assert "AnotherDummyScorer" in eval_out # Assertions for the first scorer - expected_results_dummy = {"true_count": 1, "true_fraction": 1.0/3} - assert eval_out['DummyScorer']["match"] == expected_results_dummy, "All concatenations should match the target" + expected_results_dummy = {"true_count": 1, "true_fraction": 1.0 / 3} + assert ( + eval_out["DummyScorer"]["match"] == expected_results_dummy + ), "All concatenations should match the target" # Assertions for the second scorer # Since input1 == col2, and output is col1 + col2, we check if col2 == (col1 + col2)[::-1] @@ 
-842,4 +849,6 @@ def model_function(col1, col2):
     # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" == "xyzzyx" is False
     # So all matches are False
     expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0}
-    assert eval_out['AnotherDummyScorer']["match"] == expected_results_another_dummy, "No matches should be found for AnotherDummyScorer"
+    assert (
+        eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy
+    ), "No matches should be found for AnotherDummyScorer"
diff --git a/weave/__init__.py b/weave/__init__.py
index 7cf5b49de48..3b54ba97176 100644
--- a/weave/__init__.py
+++ b/weave/__init__.py
@@ -15,20 +15,6 @@ from weave.trace.util import Thread as Thread
 from weave.trace.util import ThreadPoolExecutor as ThreadPoolExecutor
 
-from typing import TYPE_CHECKING
-
-# Helper for IDEs
-if TYPE_CHECKING:
-    from weave.flow import scorers
-
-# Lazy import for the scorers module
-def __getattr__(name):
-    if name == "scorers":
-        from weave.flow import scorers
-        globals()["scorers"] = scorers
-        return scorers
-    raise AttributeError(f"module {__name__} has no attribute {name}")
-
 # Special object informing doc generation tooling which symbols
 # to document & to associate with this module.
 __docspec__ = [
diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index 9a807d47f98..4c211e5d546 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -206,30 +206,45 @@ async def predict_and_score(
 
             # TODO: Check for input columns parameters in the signature of the scorer
-            if "model_output" not in score_arg_names and "output" not in score_arg_names:
+            if (
+                "model_output" not in score_arg_names
+                and "output" not in score_arg_names
+            ):
                 raise OpCallError(
                     f"Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function."
                 )
 
             if isinstance(example, dict):
-                # If we get a column_map from the scorer, it means that the scorer expects the input to have different names than the dataset columns
-                # So we need to remap the input names to the expected names in the scorer
-                # For instance, if the scorer expects "input" and "target" and we have a dataset with columns "question" and "expected"
-                # we need to remap {"question": "input", "expected": "target"}
-                # and pass those to the scorer
+                # The keys of `score_args` must match the parameter names of the scorer's `score` method.
+                # If scorer.column_map is set, then the user is indicating that the dataset column(s)
+                # being passed to the scorer have different names from the scorer's parameter names.
+ # So we need to remap the dataset columns to the expected parameter names in the scorer, + # + # column_map k:v pairs must be structured as `scorer param name : dataset column name` + # + # For instance, if the scorer expects "input" and "ground_truth" and we have a dataset + # with columns "question" and "answer", column_map should be defined as follows: + # {"input": "question", "ground_truth": "answer"} + # # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - print(f"scorer.column_map: {scorer.column_map}") - print(f"score_arg_names: {score_arg_names}") - print(f"example: {example}") + print( + f"scorer.column_map: {scorer.column_map}" + ) # TODO: delete print statement + print( + f"score_arg_names: {score_arg_names}" + ) # TODO: delete print statement + print(f"example: {example}") # TODO: delete print statement score_args = { arg: example[scorer.column_map.get(arg, arg)] for arg in score_arg_names if scorer.column_map.get(arg, arg) in example } else: - score_args = {k: v for k, v in example.items() if k in score_arg_names} + score_args = { + k: v for k, v in example.items() if k in score_arg_names + } else: if len(score_arg_names) == 2: diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 68c423eea3c..811880abc44 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -1,51 +1,51 @@ -from weave.flow.scorer.base_scorer import ( +from weave.flow.scorers.base_scorer import ( Scorer, auto_summarize, get_scorer_attributes, ) -from weave.flow.scorer.classification_scorer import ( +from weave.flow.scorers.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, ) -from weave.flow.scorer.hallucination_scorer import HallucinationScorer -from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import ( +from weave.flow.scorers.hallucination_scorer import HallucinationScorer +from weave.flow.scorers.json_scorer import ValidJSONScorer +from weave.flow.scorers.llm_scorer import ( InstructorLLMScorer, LLMScorer, ) -from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer -from weave.flow.scorer.pydantic_scorer import PydanticScorer -from weave.flow.scorer.ragas_scorer import ( +from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer +from weave.flow.scorers.pydantic_scorer import PydanticScorer +from weave.flow.scorers.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer -from weave.flow.scorer.string_scorer import ( +from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorers.string_scorer import ( LevenshteinScorer, RegexScorer, StringMatchScorer, ) -from weave.flow.scorer.summarization_scorer import SummarizationScorer -from weave.flow.scorer.xml_scorer import XMLScorer +from weave.flow.scorers.summarization_scorer import SummarizationScorer +from weave.flow.scorers.xml_scorer import ValidXMLScorer __all__ = [ - "Scorer", "auto_summarize", + "ContextEntityRecallScorer", + "ContextRelevancyScorer", + "EmbeddingSimilarityScorer", "get_scorer_attributes", - "MultiTaskBinaryClassificationF1", - "transpose", - "RegexScorer", - "StringMatchScorer", + "HallucinationScorer", + "InstructorLLMScorer", + "ValidJSONScorer", "LevenshteinScorer", - "JSONScorer", "LLMScorer", - "InstructorLLMScorer", - 
"EmbeddingSimilarityScorer", + "MultiTaskBinaryClassificationF1", "OpenAIModerationScorer", "PydanticScorer", - "HallucinationScorer", - "ContextEntityRecallScorer", - "ContextRelevancyScorer", + "RegexScorer", + "Scorer", + "StringMatchScorer", "SummarizationScorer", - "XMLScorer", + "transpose", + "ValidXMLScorer", ] diff --git a/weave/flow/scorers/classification_scorer.py b/weave/flow/scorers/classification_scorer.py index 622f576e678..4082b291029 100644 --- a/weave/flow/scorers/classification_scorer.py +++ b/weave/flow/scorers/classification_scorer.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple import weave -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index a2e4d38fa41..9534043d2a9 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -1,41 +1,114 @@ +from typing import List + from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create -from weave.flow.scorer.utils import stringify +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.flow.scorers.utils import stringify + +DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """ +Given some from a user and an generated by an AI system, \ +determine if the contains any hallucinations. + +A "hallucination" is defined as information in the that is not supported by \ +the or is not factually or logically consistent with the . + +# Steps +1. Carefully read and understand the input data. +2. Examine the model output. +3. Compare the output to the input data, identifying any inconsistencies or additions. +4. Evaluate the logical connection between input and output. +5. Determine if any information in the output is not supported by or conflicts with the input. + +# Guidelines +- Focus on factual accuracy and logical consistency +- Consider both explicit and implicit information in the input data +- Be aware of potential misinterpretations or over-generalizations in the output +- Identify any information in the output that goes beyond the scope of the input + +# Examples +## Data to analyze + + +The cat is black and white. + + + +The cat has orange stripes. + + +## Analysis: +{ + "think_step_by_step": "The cat is black and white. The cat has orange stripes. \ +The output contradicts the input data because the input specifies black and white, \ +while the output mentions orange. The output also introduces a pattern not present in \ +the input.", + "reasoning": [ + { + "hallucination_type": "Color comparison", + "observation": "Input specifies black and white, output mentions orange" + }, + { + "hallucination_type": "Pattern analysis", + "observation": "Input doesn't mention any pattern, output introduces stripes" + } + ], + "conclusion": "The output contains two hallucinations: it contradicts the color information \ +and introduces a pattern not present in the input." 
+ "is_hallucination": true, +} + +# Notes +- Ensure each step in the reasoning process is clearly articulated +- Be objective and avoid assumptions not supported by the input data +- If the output contains factual information not present in the input, it may be a \ +hallucination even if it doesn't directly contradict the input +""" + +DEFAULT_HALLUCINATION_USER_PROMPT = """ +Analyze the following and and determine if the contains any hallucinations. +# Data to analyze -DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" -DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. -## Input data {input_data} - + -## Model output {output} - -## Instructions -Think step by step before answering. Is the output an factually and logically consistent with the input_data? """ +class HallucinationReasoning(BaseModel): + hallucination_type: str = Field( + description="A short name for the type of hallucination." + ) + observation: str = Field( + description="An observation from the and that supports the hallucination." + ) + + class HallucinationResponse(BaseModel): chain_of_thought: str = Field( - description="Think step by step about whether the output is a hallucination of the dataset_row" + description="Think step by step about whether the contains hallucinations \ +based on the ." + ) + hallucination_reasonings: List[HallucinationReasoning] = Field( + description="A list of reasoning steps that lead to the conclusion about whether or not\ +the contains hallucinations." ) + conclusion: str = Field(description="The conclusion of the analysis.") is_hallucination: bool = Field( - description="Whether the model output is a hallucination of the dataset row" + description="Whether the contains hallucinations based on the ." 
) class HallucinationScorer(InstructorLLMScorer): """Scorer that checks if the model output is a hallucination of the dataset row.""" - system_prompt: str = DEFAULT_SYSTEM_PROMPT - user_prompt: str = DEFAULT_USER_PROMPT + system_prompt: str = DEFAULT_HALLUCINATION_SYSTEM_PROMPT + user_prompt: str = DEFAULT_HALLUCINATION_USER_PROMPT model_id: str = OPENAI_DEFAULT_MODEL temperature: float = 0.7 max_tokens: int = 4096 diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 598b7a4f002..f40f2d66fe6 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -1,10 +1,10 @@ import json from typing import Any -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer -class JSONScorer(Scorer): +class ValidJSONScorer(Scorer): """Score a JSON string.""" def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore diff --git a/weave/flow/scorers/llm_scorer.py b/weave/flow/scorers/llm_scorer.py index 7bcf9cf9af6..d319670ae77 100644 --- a/weave/flow/scorers/llm_scorer.py +++ b/weave/flow/scorers/llm_scorer.py @@ -2,8 +2,8 @@ from pydantic import Field, field_validator -from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENTS +from weave.flow.scorers.base_scorer import Scorer +from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client class LLMScorer(Scorer): @@ -16,9 +16,10 @@ class LLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return v @@ -39,8 +40,9 @@ class InstructorLLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. 
Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return instructor_client(v) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index e6bce53f8db..5d480f080b8 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, List, Optional, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Optional, Union from weave.trace.autopatch import autopatch @@ -26,6 +28,14 @@ else: _LLM_CLIENTS = object +_LLM_CLIENTS_NAMES = ( + "OpenAI", + "AsyncOpenAI", + "Anthropic", + "AsyncAnthropic", + "Mistral", +) + def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ignore try: @@ -47,12 +57,12 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") -def create(client: _LLM_CLIENTS, *args, **kwargs): # type: ignore +def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore return client.chat.completions.create(*args, **kwargs) def embed( - client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs + client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs: Any ) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() if "openai" in client_type: @@ -71,12 +81,15 @@ def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore try: if provider == "openai": from openai import OpenAI + return OpenAI elif provider == "anthropic": import anthropic + return anthropic.Anthropic elif provider == "mistral": from mistralai import Mistral + return Mistral except ImportError: return None diff --git a/weave/flow/scorers/moderation_scorer.py b/weave/flow/scorers/moderation_scorer.py index 51b92b9f85b..8a8e4eee9da 100644 --- a/weave/flow/scorers/moderation_scorer.py +++ b/weave/flow/scorers/moderation_scorer.py @@ -3,7 +3,7 @@ from pydantic import field_validator import weave -from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorers.llm_scorer import LLMScorer class OpenAIModerationScorer(LLMScorer): diff --git a/weave/flow/scorers/pydantic_scorer.py b/weave/flow/scorers/pydantic_scorer.py index 90fdffd6378..5566326774d 100644 --- a/weave/flow/scorers/pydantic_scorer.py +++ b/weave/flow/scorers/pydantic_scorer.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer class PydanticScorer(Scorer): diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py index 6180697f59f..8b3493c3542 100644 --- a/weave/flow/scorers/ragas_scorer.py +++ b/weave/flow/scorers/ragas_scorer.py @@ -6,8 +6,8 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import create +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import create class EntityExtractionResponse(BaseModel): diff --git a/weave/flow/scorers/similarity_score.py b/weave/flow/scorers/similarity_score.py index 722c16e98c2..82d8760b747 100644 --- a/weave/flow/scorers/similarity_score.py +++ b/weave/flow/scorers/similarity_score.py @@ -4,8 +4,8 @@ from pydantic import Field import weave -from weave.flow.scorer.llm_scorer import LLMScorer -from weave.flow.scorer.llm_utils import embed +from weave.flow.scorers.llm_scorer import LLMScorer 
+from weave.flow.scorers.llm_utils import embed
 
 
 class EmbeddingSimilarityScorer(LLMScorer):
diff --git a/weave/flow/scorers/string_scorer.py b/weave/flow/scorers/string_scorer.py
index a34fa561960..4dc58922668 100644
--- a/weave/flow/scorers/string_scorer.py
+++ b/weave/flow/scorers/string_scorer.py
@@ -4,7 +4,7 @@
 from pydantic import Field, model_validator
 
 import weave
-from weave.flow.scorer.base_scorer import Scorer
+from weave.flow.scorers.base_scorer import Scorer
 
 
 class StringMatchScorer(Scorer):
@@ -53,7 +53,7 @@ def score(
         text_to_search = "".join(text_to_search.split())
 
         match_found = any(
-            pattern.search(text_to_search) for pattern in compiled_patterns
+            pattern.search(str(text_to_search)) for pattern in compiled_patterns
         )
 
         return {"string_match": match_found}
@@ -70,6 +70,7 @@ def check_levenshtein(self):  # type: ignore
             from Levenshtein import distance
 
             self.distance = distance
+            return self
         except ImportError:
             raise ValueError(
                 "Levenshtein package not found. Please install it with `pip install Levenshtein`"
diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py
index ee43a7f48b1..effc4a990a2 100644
--- a/weave/flow/scorers/summarization_scorer.py
+++ b/weave/flow/scorers/summarization_scorer.py
@@ -1,40 +1,121 @@
-from textwrap import dedent
-from typing import Any, List
+import asyncio
+from typing import Any, List, Literal
 
 from pydantic import BaseModel, Field
 
 import weave
-from weave.flow.scorer.llm_scorer import InstructorLLMScorer
-from weave.flow.scorer.llm_utils import create
+from weave.flow.scorers.llm_scorer import InstructorLLMScorer
+from weave.flow.scorers.llm_utils import create
+
+DEFAULT_EXTRACTION_SYSTEM_PROMPT = """
+Given a <text>, extract all the unique entities from the text without repetition.
+"""
+
+DEFAULT_EXTRACTION_USER_PROMPT = """
+Extract all the unique entities from the following <text> without repetition:
+<text>
+{text}
+</text>
+"""
+
+DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT = """
+Given an <input> and a <summary>, evaluate the quality of the <summary>.
+
+# Considerations
+- Does the <summary> contain the key information in the <input>?
+- Is the <summary> concise and informative?
+- Is the <summary> grammatically correct?
+- Does the <summary> contain information or assertions that are not present in the <input>?
+
+# Scoring Rubric
+`excellent`: The <summary> contains all of the key information and entities in the <input>, \
+is concise and information dense, is grammatically correct and doesn't contain any \
+information or assertions that are not present in the <input>.
+
+`ok`: The <summary> contains most of the key information and entities in the <input>, \
+is somewhat concise and informative, is mostly grammatically correct and doesn't contain any \
+information or assertions that are not present in the <input>.
+
+`poor`: The <summary> misses most or all of the key information in the <input>, \
+or is very verbose or vague, or is not concise or informative, or has many grammatical errors, \
+or contains information or assertions that are not present in the <input>.
+"""
+
+DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT = """
+Evaluate the quality of the following <summary> given the <input>:
+
+<input>
+{input}
+</input>
+
+<summary>
+{summary}
+</summary>
+"""
 
 
 class EntityExtractionResponse(BaseModel):
     entities: List[str] = Field(
-        description="A list of unique entities extracted from the text"
+        description="A list of unique entities extracted from the text."
     )
 
 
-class SummarizationScorer(InstructorLLMScorer):
-    """Estimates summary quality by computing the recall of entities in the model output compared to the input."""
+summarization_quality_options = Literal["poor", "ok", "excellent"]
+summarization_quality_mapping = {"poor": 0.0, "ok": 0.5, "excellent": 1.0}
+
+
+class SummarizationEvaluationResponse(BaseModel):
+    think_step_by_step: str = Field(
+        description="Think step-by-step about the quality of the <summary> before deciding \
+on the summarization_evaluation."
+    )
+    summarization_evaluation: summarization_quality_options = Field(
+        description="The evaluation of the summary"
+    )
 
-    extraction_prompt: str = dedent("""
-    Extract unique entities from the following text without repetition.
-    Text: {text}
-    Entities:
-    """)
 
+class SummarizationScorer(InstructorLLMScorer):
+    """
+    Estimates summary quality by both:
+    - Calculating the entity density of the summary, similar to how entity density is
+      used in the Chain of Density paper, https://arxiv.org/abs/2309.04269.
+    - Using an LLM to evaluate the summary quality.
+
+    column_map: A `scorer parameter name : dataset column name` mapping.
+
+    This summarization scorer expects the input column in the dataset to be named "input" \
+    and the output column in the dataset to be named "output".
+    You can specify a different mapping in the `column_map` argument. For example, \
+    if your dataset contains columns "news_article" and "news_summary" then you can \
+    specify `column_map={"input": "news_article", "output": "news_summary"}`.
+
+    Parameters to the `score` function:
+    - input: The text that was to be summarized
+    - output: The summary of the text
+    """
+    extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT
+    extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT
+    summarization_evaluation_system_prompt: str = (
+        DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT
+    )
+    summarization_evaluation_prompt: str = DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT
+    fast_model_id: str = "gpt-4o-mini"
+    entity_density_threshold: float = 0.08
     temperature: float = 0.7
     max_tokens: int = 1024
 
+    @weave.op
     def extract_entities(self, text: str) -> List[str]:
-        # Use LLM to extract entities
-        prompt = self.extraction_prompt.format(text=text)
+        """Use an LLM to extract entities"""
         response = create(
             self.client,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[
+                {"role": "system", "content": self.extraction_system_prompt},
+                {"role": "user", "content": self.extraction_prompt.format(text=text)},
+            ],
             response_model=EntityExtractionResponse,
-            model=self.model_id,
+            model=self.fast_model_id,
             temperature=self.temperature,
             max_tokens=self.max_tokens,
         )
@@ -42,13 +123,57 @@ def extract_entities(self, text: str) -> List[str]:
         return entities
 
     @weave.op
-    def score(self, input: str, output: str, **kwargs: Any) -> dict:
-        # Extract entities
-        output_entities = self.extract_entities(output)
-        input_entities = self.extract_entities(input)
-        # Calculate recall
-        if not output_entities:
-            return {"recall": 0.0}
-        matches = set(output_entities) & set(input_entities)
-        recall = len(matches) / len(input_entities)
-        return {"recall": recall}
+    def evaluate_summary(
+        self, input: str, summary: str
+    ) -> SummarizationEvaluationResponse:
+        """Evaluate the quality of a summary using an LLM"""
+        return create(
+            self.client,
+            messages=[
+                {
+                    "role": "system",
+                    "content": self.summarization_evaluation_system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": self.summarization_evaluation_prompt.format(
+                        input=input, summary=summary
+                    ),
+                },
+            ],
+            response_model=SummarizationEvaluationResponse,
+            model=self.model_id,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+        )
+
+    def simple_word_tokenize(self, text: str) -> List[str]:
+        """Simple word tokenization"""
+        return text.split()
+
+    @weave.op
+    async def score(self, input: str, output: str, **kwargs: Any) -> dict:
+        """
+        - input: the piece of text that was to be summarized
+        - output: the generated summary of the input
+        """
+        extract_task = asyncio.to_thread(self.extract_entities, text=output)
+        evaluate_task = asyncio.to_thread(
+            self.evaluate_summary, input=input, summary=output
+        )
+        summary_entities, llm_eval = await asyncio.gather(extract_task, evaluate_task)
+
+        # LLM evaluation
+        result = {}
+        result["summarization_eval_score"] = summarization_quality_mapping.get(
+            llm_eval.summarization_evaluation.lower()
+        )
+        result["llm_eval_reasoning"] = llm_eval.think_step_by_step
+
+        # Entity density evaluation
+        summary_words = self.simple_word_tokenize(output)
+        entity_density = len(summary_entities) / len(summary_words)
+        result["is_entity_dense"] = entity_density >= self.entity_density_threshold
+        result["entity_density"] = entity_density
+
+        return result
diff --git a/weave/flow/scorers/xml_scorer.py b/weave/flow/scorers/xml_scorer.py
index 7bd42516e69..2ea8384477f 100644
--- a/weave/flow/scorers/xml_scorer.py
+++ b/weave/flow/scorers/xml_scorer.py
@@ -1,10 +1,10 @@
 import xml.etree.ElementTree as ET
 from typing import Union
 
-from weave.flow.scorer.base_scorer import Scorer
+from weave.flow.scorers.base_scorer import Scorer
 
 
-class XMLScorer(Scorer):
+class ValidXMLScorer(Scorer):
     """Score an XML string."""
 
     def score(self, output: Union[str, dict]) -> dict:  # type: ignore
diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py
new file mode 100644
index 00000000000..a1db6897f34
--- /dev/null
+++ b/weave/scorers/__init__.py
@@ -0,0 +1 @@
+from weave.flow.scorers import *
\ No newline at end of file
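
For readers updating call sites after this patch, here is a minimal usage sketch of the relocated `weave.scorers` import path and the re-written `SummarizationScorer`, wired into an evaluation through `column_map` as described in the scorer's docstring. It only uses APIs exercised elsewhere in this patch (`weave.init`, `weave.Evaluation`, `weave.op`, the scorer constructor with `client`/`model_id`); the project name, dataset rows, and stand-in model are illustrative placeholders, not part of the change.

```python
# Illustrative sketch only: exercises the new `weave.scorers` import path and the
# SummarizationScorer's column_map support. The project name, dataset, and the
# stand-in model below are placeholder assumptions, not part of this patch.
import asyncio

import weave
from openai import OpenAI
from weave.scorers import SummarizationScorer

weave.init("summarization-demo")  # placeholder project name

# The dataset column "news_article" differs from the scorer's "input" parameter,
# so remap it via column_map ({"scorer param name": "dataset column name"}).
dataset = [
    {"news_article": "The city council approved the new transit budget on Tuesday."},
    {"news_article": "Researchers reported record-low winter sea ice extent this year."},
]

summarization_scorer = SummarizationScorer(
    client=OpenAI(),  # reads OPENAI_API_KEY from the environment
    model_id="gpt-4o",
    column_map={"input": "news_article"},
)


@weave.op()
def summarize(news_article: str) -> str:
    # Stand-in "model": crude truncation; a real setup would call an LLM here.
    return " ".join(news_article.split()[:8])


evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])

if __name__ == "__main__":
    asyncio.run(evaluation.evaluate(summarize))
```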