diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 64b764b99d2..0605b0534df 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -240,6 +240,7 @@ jobs: 'mistral1', 'notdiamond', 'openai', + 'scorers_tests', 'pandas-test', ] fail-fast: false @@ -292,6 +293,9 @@ jobs: WF_CLICKHOUSE_HOST: weave_clickhouse WEAVE_SERVER_DISABLE_ECOSYSTEM: 1 GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')" trace-tests-matrix-check: # This job does nothing and is only used for the branch protection diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md new file mode 100644 index 00000000000..ce7ea3b86c1 --- /dev/null +++ b/docs/docs/guides/evaluation/scorers.md @@ -0,0 +1,670 @@ +# Evaluation Metrics + +## Evaluations in Weave +In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. They take the AI's output, analyze it, and return a dictionary of results. Scorers can use your input data as reference if needed and can also output extra information, such as explanations or reasonings from the evaluation. + +Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers in weave: + +1. **Function-based Scorers:** Simple Python functions decorated with `@weave.op`. +2. **Class-based Scorers:** Python classes that inherit from `weave.Scorer` for more complex evaluations. + +Scorers must return a dictionary and can return multiple metrics, nested metrics and non-numeric values such as text returned from a LLM-evaluator about its reasoning. + +## Create your own Scorers +### Function-based Scorers +These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like: + +```python +import weave + +@weave.op +def evaluate_uppercase(text: str) -> dict: # Added return type hint + return {"text_is_uppercase": text.isupper()} + +my_eval = weave.Evaluation( + dataset=[{"text": "HELLO WORLD"}], + scorers=[evaluate_uppercase] +) +``` + +When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase. + +### Class-based Scorers +For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class. + +**Requirements:** +1. Inherit from `weave.Scorer`. +2. Define a `score` method decorated with `@weave.op`. +3. The `score` method must return a dictionary. + +Example: + + +```python +import weave +from openai import OpenAI +from weave import Scorer + +llm_client = OpenAI() + +#highlight-next-line +class SummarizationScorer(Scorer): + model_id: str = "gpt-4o" + system_prompt: str = "Evaluate whether the summary is good." + + @weave.op + def some_complicated_preprocessing(self, text: str) -> str: + processed_text = "Original text: \n" + text + "\n" + return processed_text + + @weave.op + def call_llm(self, summary: str, processed_text: str) -> dict: + res = llm_client.chat.completions.create( + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": ( + f"Analyse how good the summary is compared to the original text." 
+ f"Summary: {summary}\n{processed_text}" + )}]) + return {"summary_quality": res} + + @weave.op + def score(self, output: str, text: str) -> dict: + """Score the summary quality. + + Args: + output: The summary generated by an AI system + text: The original text being summarized + """ + processed_text = self.some_complicated_preprocessing(text) + eval_result = self.call_llm(summary=output, processed_text=processed_text) + return {"summary_quality": eval_result} + +evaluation = weave.Evaluation( + dataset=[{"text": "The quick brown fox jumps over the lazy dog."}], + scorers=[summarization_scorer]) +``` +This class evaluates how good a summary is by comparing it to the original text. + +## How Scorers Work +### Scorer Keyword Arguments +Scorers can access both the output from your AI system and the input data from the dataset row. + +- **Input:** If you would like your scorer to use data from your dataset row, such as a "label" or "target" column then you can easily make this available to the scorer by adding a `label` or `target` keyword argument to your scorer definition. + +For example if you wanted to use a column called "label" from your dataset then your scorer function (or `score` class method) would have a parameter list like this: + +```python +@weave.op +def my_custom_scorer(output: str, label: int) -> dict: # Added return type hint + ... +``` + +When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer argument names to your dataset columns. If customizing your scorer arguments or dataset columns is not feasible, you can use column mapping - see below for more. + +- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output. + + +### Mapping Column Names with column_map +Sometimes, the `score` methods' argument names don't match the column names in your dataset. You can fix this using a `column_map`. + +If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialise your scorer class. This dictionary maps your `score` method's argument names to the dataset's column names, in the order: `{scorer_keyword_argument: dataset_column_name}`. + +Example: + +```python +import weave +from weave import Scorer + +# A dataset with news articles to be summarised +dataset = [ + {"news_article": "The news today was great...", "date": "2030-04-20", "source": "Bright Sky Network"}, + ... +] + +# Scorer class +class SummarizationScorer(Scorer): + + @weave.op + def score(output, text) -> dict: + """ + output: output summary from a LLM summarization system + text: the text being summarised + """ + ... # evaluate the quality of the summary + +# create a scorer with a column mapping the `text` argument to the `news_article` data column +scorer = SummarizationScorer(column_map={"text" : "news_article"}) +``` + +Now, the `text` argument in the `score` method will receive data from the `news_article` dataset column. + +**Notes:** +- Another equivalent option to map your columns is to subclass the `Scorer` and overload the `score` method mapping the columns explicitly. 
+ +```python +import weave +from weave import Scorer + +class MySummarizationScorer(SummarizationScorer): + + @weave.op + def score(self, output: str, news_article: str) -> dict: # Added type hints + # overload the score method and map columns manually + return super().score(output=output, text=news_article) +``` + +### Final summarization of the scorer + +During evaluation, the scorer will be computed for each row of your dataset. To provide a final score for the evaluation we provide an `auto_summarize` depending on the returning type of the output. + - average will be computed for numerical columns + - count and fraction for boolean cols + - other col types are ignored + +You can override the `summarize` method on the `Scorer` class and provide your own way of computing the final scores. The `summarize` function expects: + +- A single parameter `score_rows`: This is a list of dictionaries, where each dictionary contains the scores returned by the `score` method for a single row of your dataset. +- It should return a dictionary containing the summarized scores. + +**Why this is useful?** + +When you need to score all rows before deciding on the final value of the score for the dataset. + +```python +class MyBinaryScorer(Scorer): + """ + Returns True if the full output matches the target, False if not + """ + + @weave.op + def score(output, target): + return {"match": if output == target} + + def summarize(self, score_rows: list) -> dict: + full_match = all(row["match"] for row in score_rows) + return {"full_match": full_match} +``` +> In this example, the default `auto_summarize` would have returned the count and proportion of True. + +If you want to learn more, check the implementation of [CorrectnessLLMJudge](/tutorial-rag#optional-defining-a-scorer-class). + +## Predefined Scorers + +**Installation** + +To use Weave's predefined scorers you need to install some additional dependencies: + +```bash +pip install weave[scorers] +``` + +**LLM-evaluators** + +The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. You can get all necessary dependencies with `pip install "weave[scorers]"` + +### `HallucinationFreeScorer` + +This scorer checks if your AI system's output includes any hallucinations based on the input data. + +```python +from weave.scorers import HallucinationFreeScorer + +llm_client = ... # initialize your LLM client here + +scorer = HallucinationFreeScorer( + client=llm_client, + model_id="gpt4o" +) +``` + +**Customization:** +- Customize the `system_prompt` and `user_prompt` attributes of the scorer to define what "hallucination" means for you. + +**Notes:** +- The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column. 
+ +Here you have an example in the context of an evaluation: + +```python +import asyncio +from openai import OpenAI +import weave +from weave.scorers import HallucinationFreeScorer + +# Initialize clients and scorers +llm_client = OpenAI() +hallucination_scorer = HallucinationFreeScorer( + client=llm_client, + model_id="gpt-4o", + column_map={"context": "input", "output": "other_col"} +) + +# Create dataset +dataset = [ + {"input": "John likes various types of cheese."}, + {"input": "Pepe likes various types of cheese."}, +] + +@weave.op +def model(input: str) -> str: + return "The person's favorite cheese is cheddar." + +# Run evaluation +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], +) +result = asyncio.run(evaluation.evaluate(model)) +print(result) +# {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}} +``` +--- + +### `SummarizationScorer` + +Use an LLM to compare a summary to the original text and evaluate the quality of the summary. + +```python +from weave.scorers import SummarizationScorer + +llm_client = ... # initialize your LLM client here + +scorer = SummarizationScorer( + client=llm_client, + model_id="gpt4o" +) +``` + +**How It Works:** + +This scorer evaluates summaries in two ways: + +1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the Chain of Density paper, https://arxiv.org/abs/2309.04269 + +2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages. + +**Customization:** +- Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary. + +**Notes:** +- This scorer uses the `InstructorLLMScorer` class. +- The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed. + + +Here you have an example usage of the `SummarizationScorer` in the context of an evaluation: + +```python +import asyncio +from openai import OpenAI +import weave +from weave.scorers import SummarizationScorer + +class SummarizationModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + return "This is a summary of the input text." 
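+# Placeholder model: in a real scenario, predict() would call an LLM to generate the summary.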
+ +# Initialize clients and scorers +llm_client = OpenAI() +model = SummarizationModel() +summarization_scorer = SummarizationScorer( + client=llm_client, + model_id="gpt-4o", +) +# Create dataset +dataset = [ + {"input": "The quick brown fox jumps over the lazy dog."}, + {"input": "Artificial Intelligence is revolutionizing various industries."} +] + +# Run evaluation +evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +# {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}} +``` + +--- + +### `OpenAIModerationScorer` + +The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material. + +```python +from weave.scorers import OpenAIModerationScorer +from openai import OpenAI + +oai_client = OpenAI(api_key=...) # initialize your LLM client here + +scorer = OpenAIModerationScorer( + client=oai_client, + model_id="text-embedding-3-small" +) +``` + +**How It Works:** + +- Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved. + +**Notes:** +- Requires the `openai` Python package. +- The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client. + + +Here you have an example in the context of an evaluation: +```python +import asyncio +from openai import OpenAI +import weave +from weave.scorers import OpenAIModerationScorer + +class MyModel(weave.Model): + @weave.op + async def predict(self, input: str) -> str: + return input + +# Initialize clients and scorers +client = OpenAI() +model = MyModel() +moderation_scorer = OpenAIModerationScorer(client=client) + +# Create dataset +dataset = [ + {"input": "I love puppies and kittens!"}, + {"input": "I hate everyone and want to hurt them."} +] + +# Run evaluation +evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +# {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}} +``` + +--- + +### `EmbeddingSimilarityScorer` + +The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text. + +```python +from weave.scorers import EmbeddingSimilarityScorer + +llm_client = ... # initialise your LlM client + +similarity_scorer = EmbeddingSimilarityScorer( + client=llm_client + target_column="reference_text", # the dataset column to compare the output against + threshold=0.4 # the cosine similarity threshold to use +) +``` + +**Parameters:** + +- `target`: This scorer expects a `target` column in your dataset, it will calculate the cosine similarity of the embeddings of the `target` column to the AI system output. If your dataset doesn't contain a column called `target` you can use the scorers `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more. 
+- `threshold` (float): The minimum cosine similarity score between the embedding of the AI system output and the embdedding of the `target`, above which the 2 samples are considered "similar", (defaults to `0.5`). `threshold` can be in a range from -1 to 1: + - 1 indicates identical direction. + - 0 indicates orthogonal vectors. + - -1 indicates opposite direction. + +The correct cosine similarity threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds. + + +Here you have an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation: + +```python +import asyncio +from openai import OpenAI +import weave +from weave.scorers import EmbeddingSimilarityScorer + +# Initialize clients and scorers +client = OpenAI() +similarity_scorer = EmbeddingSimilarityScorer( + client=client, + threshold=0.7, + column_map={"target": "reference"} +) + +# Create dataset +dataset = [ + { + "input": "He's name is John", + "reference": "John likes various types of cheese.", + }, + { + "input": "He's name is Pepe.", + "reference": "Pepe likes various types of cheese.", + }, +] + +# Define model +@weave.op +def model(input: str) -> str: + return "John likes various types of cheese." + +# Run evaluation +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], +) +result = asyncio.run(evaluation.evaluate(model)) +print(result) +# {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}} +``` + +--- + +### `ValidJSONScorer` + +The ValidJSONScorer checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity. + +```python +from weave.scorers import ValidJSONScorer + +json_scorer = ValidJSONScorer() +``` + +Here you have an example usage of the `ValidJSONScorer` in the context of an evaluation: + +```python +import asyncio +import weave +from weave.scorers import ValidJSONScorer + +class JSONModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + # This is a placeholder. + # In a real scenario, this would generate JSON. + return '{"key": "value"}' + +model = JSONModel() +json_scorer = ValidJSONScorer() + +dataset = [ + {"input": "Generate a JSON object with a key and value"}, + {"input": "Create an invalid JSON"} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +# {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}} +``` + + +--- + +### `ValidXMLScorer` + +The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs. + +```python +from weave.scorers import ValidXMLScorer + +xml_scorer = ValidXMLScorer() +``` + + +Here you have an example usage of the `ValidXMLScorer` in the context of an evaluation: + +```python +import asyncio +import weave +from weave.scorers import ValidXMLScorer + +class XMLModel(weave.Model): + @weave.op() + async def predict(self, input: str) -> str: + # This is a placeholder. In a real scenario, this would generate XML. 
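+        # A well-formed XML document needs a single root element, e.g. '<root>value</root>'.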
+ return 'value' + +model = XMLModel() +xml_scorer = ValidXMLScorer() + +dataset = [ + {"input": "Generate a valid XML with a root element"}, + {"input": "Create an invalid XML"} +] + +evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer]) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +# {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}} +``` + +--- + +### `PydanticScorer` + +The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure. + +```python +from weave.scorers import PydanticScorer +from pydantic import BaseModel + +class FinancialReport(BaseModel): + revenue: int + year: str + +pydantic_scorer = PydanticScorer(model=FinancialReport) +``` + +--- + +### RAGAS - `ContextEntityRecallScorer` + +The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library + +```python +from weave.scorers import ContextEntityRecallScorer + +llm_client = ... # initialise your LlM client + +entity_recall_scorer = ContextEntityRecallScorer( + client=llm_client + model_id="your-model-id" +) +``` + +**How It Works:** + +- Uses an LLM to extract unique entities from the output and context and calculates recall. +- **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information. +- Returns a dictionary with the recall score. + +**Notes:** + +- Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. + +--- + +### RAGAS - `ContextRelevancyScorer` + +The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. + +```python +from weave.scorers import ContextRelevancyScorer + +llm_client = ... # initialise your LlM client + +relevancy_scorer = ContextRelevancyScorer( + llm_client = ... # initialise your LlM client + model_id="your-model-id" + ) +``` + +**How It Works:** + +- Uses an LLM to rate the relevancy of the context to the output on a scale from 0 to 1. +- Returns a dictionary with the `relevancy_score`. + +**Notes:** + +- Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. +- Customize the `relevancy_prompt` to define how relevancy is assessed. + + +Here you have an example usage of `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation: + +```python +import asyncio +from textwrap import dedent +from openai import OpenAI +import weave +from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer + +class RAGModel(weave.Model): + @weave.op() + async def predict(self, question: str) -> str: + "Retrieve relevant context" + return "Paris is the capital of France." + + +model = RAGModel() + +# Define prompts +relevancy_prompt: str = dedent(""" + Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. 
+ + Question: {question} + Context: {context} + Relevancy Score (0-1): + """) + +# Initialize clients and scorers +llm_client = OpenAI() +entity_recall_scorer = ContextEntityRecallScorer( + client=client, + model_id="gpt-4o", +) + +relevancy_scorer = ContextRelevancyScorer( + client=llm_client, + model_id="gpt-4o", + relevancy_prompt=relevancy_prompt +) + +# Create dataset +dataset = [ + { + "question": "What is the capital of France?", + "context": "Paris is the capital city of France." + }, + { + "question": "Who wrote Romeo and Juliet?", + "context": "William Shakespeare wrote many famous plays." + } +] + +# Run evaluation +evaluation = weave.Evaluation( + dataset=dataset, + scorers=[entity_recall_scorer, relevancy_scorer] +) +results = asyncio.run(evaluation.evaluate(model)) +print(results) +# {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}} +``` + diff --git a/docs/docs/guides/integrations/langchain.md b/docs/docs/guides/integrations/langchain.md index b382e793e70..4487a85dfd4 100644 --- a/docs/docs/guides/integrations/langchain.md +++ b/docs/docs/guides/integrations/langchain.md @@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav ```python -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 sentences = [ "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", diff --git a/docs/docs/guides/tracking/costs.md b/docs/docs/guides/tracking/costs.md index 8bcddeb2e0c..bedca15aa17 100644 --- a/docs/docs/guides/tracking/costs.md +++ b/docs/docs/guides/tracking/costs.md @@ -1,9 +1,5 @@ # Costs -:::info -Custom costs are accessible via Python and REST queries. UI uptake is under development and expected to be complete by middle of October 2024 -::: - ## Adding a custom cost You can add a custom cost by using the [`add_cost`](/reference/python-sdk/weave/trace/weave.trace.weave_client#method-add_cost) method. diff --git a/docs/docs/reference/gen_notebooks/audio_with_weave.md b/docs/docs/reference/gen_notebooks/audio_with_weave.md index c351bda16e0..a8c6b45efc1 100644 --- a/docs/docs/reference/gen_notebooks/audio_with_weave.md +++ b/docs/docs/reference/gen_notebooks/audio_with_weave.md @@ -1,3 +1,6 @@ +--- +title: Log Audio With Weave +--- :::tip[This is a notebook] @@ -9,6 +12,9 @@ ::: + + + # How to use Weave with Audio Data: An OpenAI Example This demo uses the OpenAI chat completions API with GPT 4o Audio Preview to generate audio responses to text prompts and track these in Weave. @@ -39,6 +45,7 @@ Next, load the required API keys for OpenAI and Weave. Here, we use set_env whic ```python # Set environment variables. +from set_env import set_env _ = set_env("OPENAI_API_KEY") _ = set_env("WANDB_API_KEY") @@ -152,7 +159,7 @@ prompt_endpoint_and_log_trace( display(Audio("output.wav", rate=SAMPLE_RATE, autoplay=True)) ``` -# Advanced Usage: Realtime Audio API with Weave #TODO: Record video of weave traces and me chatting with it w/ the terminal output side by side. +# Advanced Usage: Realtime Audio API with Weave
(Advanced) Realtime Audio API with Weave diff --git a/docs/docs/reference/gen_notebooks/custom_model_cost.md b/docs/docs/reference/gen_notebooks/custom_model_cost.md index fc7bf819c2a..093286e9f06 100644 --- a/docs/docs/reference/gen_notebooks/custom_model_cost.md +++ b/docs/docs/reference/gen_notebooks/custom_model_cost.md @@ -92,6 +92,7 @@ class YourModel(Model): "usage": { "input_tokens": prompt_tokens, "output_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, }, "model": "your_model_name", "output": prediction, diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md index 2b4f202244d..44ccdfa5a9d 100644 --- a/docs/docs/tutorial-eval.md +++ b/docs/docs/tutorial-eval.md @@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used ```python import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 weave.init('intro-example') @@ -132,7 +132,7 @@ import asyncio # highlight-next-line import weave # highlight-next-line -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 import openai # We create a model class with one predict function. diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md index 43fbf3d9994..e88e27e38bc 100644 --- a/docs/docs/tutorial-rag.md +++ b/docs/docs/tutorial-rag.md @@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple: ```python -from weave.flow.scorer import Scorer +from weave.scorers import Scorer from weave import WeaveList class CorrectnessLLMJudge(Scorer): diff --git a/docs/notebooks/audio_with_weave.ipynb b/docs/notebooks/audio_with_weave.ipynb index c192c589e57..a4a3dfa2d04 100644 --- a/docs/notebooks/audio_with_weave.ipynb +++ b/docs/notebooks/audio_with_weave.ipynb @@ -1,5 +1,18 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "" + ] + }, { "cell_type": "markdown", "metadata": { @@ -66,6 +79,7 @@ "outputs": [], "source": [ "# Set environment variables.\n", + "from set_env import set_env\n", "\n", "_ = set_env(\"OPENAI_API_KEY\")\n", "_ = set_env(\"WANDB_API_KEY\")" @@ -253,7 +267,7 @@ "id": "P7zY5fho4hOG" }, "source": [ - "# Advanced Usage: Realtime Audio API with Weave #TODO: Record video of weave traces and me chatting with it w/ the terminal output side by side. \n", + "# Advanced Usage: Realtime Audio API with Weave\n", "\n", "
\n", " (Advanced) Realtime Audio API with Weave \n", diff --git a/docs/notebooks/custom_model_cost.ipynb b/docs/notebooks/custom_model_cost.ipynb index 87b0187593d..def49145f9f 100644 --- a/docs/notebooks/custom_model_cost.ipynb +++ b/docs/notebooks/custom_model_cost.ipynb @@ -12,7 +12,6 @@ "---\n", "docusaurus_head_meta::end -->\n", "\n", - "" ] }, @@ -196,6 +195,7 @@ " \"usage\": {\n", " \"input_tokens\": prompt_tokens,\n", " \"output_tokens\": completion_tokens,\n", + " \"total_tokens\": prompt_tokens + completion_tokens,\n", " },\n", " \"model\": \"your_model_name\",\n", " \"output\": prediction,\n", diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 64c8e3126ec..c5da61462b5 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -54,9 +54,18 @@ const sidebars: SidebarsConfig = { "guides/tracking/objects", ], }, + { + type: "category", + collapsible: true, + collapsed: false, + label: "Evaluation", + link: { type: "doc", id: "guides/core-types/evaluations"}, + items: [ + "guides/evaluation/scorers", + ], + }, "guides/core-types/models", "guides/core-types/datasets", - "guides/core-types/evaluations", "guides/tracking/feedback", "guides/tracking/costs", "guides/core-types/media", diff --git a/examples/text-extract/evaluate.py b/examples/text-extract/evaluate.py index abb292b198e..357f101e387 100644 --- a/examples/text-extract/evaluate.py +++ b/examples/text-extract/evaluate.py @@ -6,7 +6,7 @@ import openai import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 class TextExtractModel(weave.Model): diff --git a/examples/tutorial_scripts/05_eval_pipeline.py b/examples/tutorial_scripts/05_eval_pipeline.py index ccb14126a03..0a6a5baf9ab 100644 --- a/examples/tutorial_scripts/05_eval_pipeline.py +++ b/examples/tutorial_scripts/05_eval_pipeline.py @@ -60,7 +60,7 @@ async def predict(self, sentence: str) -> dict: ] import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 @weave.op() diff --git a/examples/tutorial_scripts/06_eval_pipeline_all.py b/examples/tutorial_scripts/06_eval_pipeline_all.py index 6be10f08a44..0d5fe8fd3b2 100644 --- a/examples/tutorial_scripts/06_eval_pipeline_all.py +++ b/examples/tutorial_scripts/06_eval_pipeline_all.py @@ -4,7 +4,7 @@ import openai import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 # We create a model class with one predict function. # All inputs, predictions and parameters are automatically captured for easy inspection. 
diff --git a/noxfile.py b/noxfile.py index bb74b97ec34..90aa3bfaac4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -11,6 +11,7 @@ "litellm", "notdiamond", "google_ai_studio", + "scorers_tests", ] @@ -40,6 +41,7 @@ def lint(session): "mistral1", "notdiamond", "openai", + "scorers_tests", "pandas-test", ], ) @@ -64,12 +66,21 @@ def tests(session, shard): if shard == "google_ai_studio": env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") + # we are doing some integration test in test_llm_integrations.py that requires + # setting some environment variables for the LLM providers + if shard == "scorers_tests": + env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY") + env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY") + env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY") + env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY") + default_test_dirs = [f"integrations/{shard}/"] test_dirs_dict = { "trace": ["trace/"], "trace_server": ["trace_server/"], "mistral0": ["integrations/mistral/v0/"], "mistral1": ["integrations/mistral/v1/"], + "scorers_tests": ["scorers/"], } test_dirs = test_dirs_dict.get(shard, default_test_dirs) diff --git a/pyproject.toml b/pyproject.toml index ff5403c4e89..407b7548327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,8 @@ litellm = ["litellm>=1.36.1"] llamaindex = ["llama-index>=0.10.35"] mistral0 = ["mistralai>=0.1.8,<1.0.0"] mistral1 = ["mistralai>=1.0.0"] +scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"] +scorers_tests = ["instructor>=1.5.2", "Levenshtein>=0.26.0", "openai>=1.0.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"] notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"] openai = ["openai>=1.0.0"] pandas-test = ["pandas>=2.2.3"] diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py new file mode 100644 index 00000000000..5f71fe724b9 --- /dev/null +++ b/tests/scorers/test_hallucination_scorer.py @@ -0,0 +1,105 @@ +import pytest +from openai import OpenAI + +import weave +from weave.scorers import ( + HallucinationFreeScorer, +) +from weave.scorers.hallucination_scorer import ( + HallucinationReasoning, + HallucinationResponse, +) + + +# mock the create function +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + return HallucinationResponse( + chain_of_thought="The output is consistent with the input data.", + reasonings=[ + HallucinationReasoning( + observation="My observation for this is that the output is consistent with the input data.", + hallucination_type="No Hallucination", + ) + ], + conclusion="The output is consistent with the input data.", + has_hallucination=True, + ) + + monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create) + + +@pytest.fixture +def hallucination_scorer(mock_create): + return HallucinationFreeScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=4096, + ) + + +def test_hallucination_scorer_score(hallucination_scorer, mock_create): + output = "John's favorite cheese is cheddar." + context = "John likes various types of cheese." + result = hallucination_scorer.score(output=output, context=context) + # we should be able to do this validation + _ = HallucinationResponse.model_validate(result) + + assert result["has_hallucination"] == True + assert result["conclusion"] == "The output is consistent with the input data." 
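+    # The mocked response above intentionally returns has_hallucination=True with a single 'No Hallucination' reasoning.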
+ assert len(result["reasonings"]) == 1 + assert result["reasonings"][0]["hallucination_type"] == "No Hallucination" + + +@pytest.mark.asyncio +async def test_hallucination_scorer_eval(hallucination_scorer): + dataset = [ + {"context": "John likes various types of cheese."}, + {"context": "Pepe likes various types of cheese."}, + ] + + @weave.op + def model(): + return "John's favorite cheese is cheddar." + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], + ) + result = await evaluation.evaluate(model) + assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 + assert ( + result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 + ) + + +@pytest.mark.asyncio +async def test_hallucination_scorer_eval2(hallucination_scorer): + dataset = [ + { + "input": "John likes various types of cheese.", + "other_col": "John's favorite cheese is cheddar.", + }, + { + "input": "Pepe likes various types of cheese.", + "other_col": "Pepe's favorite cheese is gouda.", + }, + ] + + @weave.op + def model(input): + return "The person's favorite cheese is cheddar." + + hallucination_scorer.column_map = {"context": "input", "output": "other_col"} + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[hallucination_scorer], + ) + result = await evaluation.evaluate(model) + assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2 + assert ( + result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0 + ) diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py new file mode 100644 index 00000000000..c80b7a54743 --- /dev/null +++ b/tests/scorers/test_json_scorer.py @@ -0,0 +1,21 @@ +import pytest + +from weave.scorers import ValidJSONScorer + + +@pytest.mark.parametrize( + "output, expected_result", + [ + ('{"city": "San Francisco", "country": "USA"}', True), + ('{"city": "San Francisco", "country": "USA"', False), + ("Just a plain string.", False), + ("[1, 2, 3, 4, 5]", True), + ('{"person": {"name": "John", "age": 30}, "city": "New York"}', True), + ("{}", True), + ("[]", True), + ], +) +def test_json_scorer(output, expected_result): + scorer = ValidJSONScorer() + result = scorer.score(output) + assert result["json_valid"] is expected_result diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py new file mode 100644 index 00000000000..0336955d740 --- /dev/null +++ b/tests/scorers/test_llm_integrations.py @@ -0,0 +1,82 @@ +import os + +import pytest + +from weave.scorers.summarization_scorer import ( + SummarizationEvaluationResponse, + SummarizationScorer, +) + +# Define providers and their models +TEST_MODELS = { + "openai": ["gpt-4o-mini", "gpt-4o"], + "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"], + "mistral": ["mistral-small-latest", "mistral-large-latest"], + "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"], +} + + +def get_client_and_model(provider, model): + api_key_env_vars = { + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "mistral": "MISTRAL_API_KEY", + "gemini": "GOOGLE_API_KEY", + } + + if provider not in TEST_MODELS: + raise ValueError(f"Unknown provider: {provider}") + + if model not in TEST_MODELS[provider]: + raise ValueError(f"Model '{model}' not available for provider '{provider}'") + + api_key = os.getenv(api_key_env_vars[provider]) + if not api_key: + raise EnvironmentError( + f"API key for {provider} not found. 
Please set '{api_key_env_vars[provider]}' environment variable." + ) + + if provider == "openai": + from openai import OpenAI + + client = OpenAI(api_key=api_key) + elif provider == "anthropic": + from anthropic import Anthropic + + client = Anthropic(api_key=api_key) + elif provider == "mistral": + from mistralai import Mistral + + client = Mistral(api_key=api_key) + elif provider == "gemini": + import google.generativeai as genai + + genai.configure(api_key=api_key) + client = genai.GenerativeModel(model_name=model) + model = "gemini" # Adjust if necessary + + return client, model + + +# Generate test parameters +test_params = [ + (provider, model) for provider, models in TEST_MODELS.items() for model in models +] + + +@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}") +def test_summarization_scorer_evaluate_summary(provider, model): + client, model_id = get_client_and_model(provider, model) + + summarization_scorer = SummarizationScorer( + client=client, + model_id=model_id, + temperature=0.7, + max_tokens=1024, + ) + input_text = "This is the original text." + summary_text = "This is the summary." + result = summarization_scorer.evaluate_summary( + input=input_text, summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py new file mode 100644 index 00000000000..f06dc83bca7 --- /dev/null +++ b/tests/scorers/test_pydantic_scorer.py @@ -0,0 +1,30 @@ +import pytest +from pydantic import BaseModel + +from weave.scorers import PydanticScorer + + +class User(BaseModel): + name: str + age: int + + +@pytest.fixture +def user_scorer(): + return PydanticScorer(model=User) + + +@pytest.mark.parametrize( + "input_data, expected_result", + [ + ('{"name": "John", "age": 30}', {"valid_pydantic": True}), + ({"name": "John", "age": 30}, {"valid_pydantic": True}), + ('{"name": "John", "age": "thirty"}', {"valid_pydantic": False}), + ({"name": "John", "age": "thirty"}, {"valid_pydantic": False}), + ('{"name": "John"}', {"valid_pydantic": False}), + ('{"name": "John", "age": 30, "city": "New York"}', {"valid_pydantic": True}), + (123, {"valid_pydantic": False}), + ], +) +def test_pydantic_scorer(user_scorer, input_data, expected_result): + assert user_scorer.score(input_data) == expected_result diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py new file mode 100644 index 00000000000..f663ac965c2 --- /dev/null +++ b/tests/scorers/test_ragas_scorer.py @@ -0,0 +1,66 @@ +import pytest +from openai import OpenAI + +from weave.scorers import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) +from weave.scorers.ragas_scorer import ( + EntityExtractionResponse, + RelevancyResponse, +) + + +# Mock the create function +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + # Retrieve the response_model to return appropriate mock responses + response_model = kwargs.get("response_model") + if response_model is EntityExtractionResponse: + return EntityExtractionResponse(entities=["Paris"]) + elif response_model is RelevancyResponse: + return RelevancyResponse( + reasoning="The context directly answers the question.", + relevancy_score=1, + ) + + monkeypatch.setattr("weave.scorers.ragas_scorer.create", _mock_create) + + +@pytest.fixture +def context_entity_recall_scorer(mock_create): + return ContextEntityRecallScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + 
temperature=0.7, + max_tokens=1024, + ) + + +@pytest.fixture +def context_relevancy_scorer(mock_create): + return ContextRelevancyScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + + +def test_context_entity_recall_scorer_score(context_entity_recall_scorer): + output = "Paris is the capital of France." + context = "The capital city of France is Paris." + result = context_entity_recall_scorer.score(output, context) + assert isinstance(result, dict) + assert "recall" in result + assert result["recall"] == 1.0 # Assuming full recall in mock response + + +def test_context_relevancy_scorer_score(context_relevancy_scorer): + output = "What is the capital of France?" + context = "Paris is the capital city of France." + result = context_relevancy_scorer.score(output, context) + assert isinstance(result, dict) + assert "relevancy_score" in result + assert result["relevancy_score"] == 1 # Assuming relevancy in mock response diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py new file mode 100644 index 00000000000..0a02296a55a --- /dev/null +++ b/tests/scorers/test_similarity_scorer.py @@ -0,0 +1,92 @@ +import pytest +from openai import OpenAI + +import weave +from weave.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL +from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer + + +# mock the create function +@pytest.fixture +def mock_embed(monkeypatch): + def _mock_embed(*args, **kwargs): + import random + + return [[random.random() for _ in range(1024)] for _ in range(2)] + + monkeypatch.setattr("weave.scorers.similarity_scorer.embed", _mock_embed) + + +@pytest.fixture +def similarity_scorer(mock_embed): + return EmbeddingSimilarityScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id=OPENAI_DEFAULT_EMBEDDING_MODEL, + threshold=0.9, + ) + + +def test_similarity_scorer_score(similarity_scorer): + output = "John's favorite cheese is cheddar." + target = "John likes various types of cheese." + similarity_scorer.threshold = 0.0 + result = similarity_scorer.score(output=output, target=target) + assert result["similarity_score"] > 0.0 + assert result["is_similar"] is True + + +def test_similarity_scorer_not_similar(similarity_scorer): + output = "John's favorite cheese is cheddar." + target = "John likes various types of cheese." + similarity_scorer.threshold = 0.99 + result = similarity_scorer.score(output=output, target=target) + assert result["similarity_score"] < 0.99 + assert result["is_similar"] is False + + +@pytest.mark.asyncio +async def test_similarity_scorer_eval(similarity_scorer): + dataset = [ + {"target": "John likes various types of cheese."}, + {"target": "Pepe likes various types of cheese."}, + ] + + @weave.op + def model(): + return "He's name is John" + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], + ) + result = await evaluation.evaluate(model) + assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0 + assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2 + + +@pytest.mark.asyncio +async def test_similarity_scorer_eval2(similarity_scorer): + dataset = [ + { + "input": "He's name is John", + "other_col": "John likes various types of cheese.", + }, + { + "input": "He's name is Pepe.", + "other_col": "Pepe likes various types of cheese.", + }, + ] + + @weave.op + def model(input): + return "John likes various types of cheese." 
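+    # Remap the scorer's `target` argument to the dataset's `other_col` column.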
+ + similarity_scorer.column_map = {"target": "other_col"} + + evaluation = weave.Evaluation( + dataset=dataset, + scorers=[similarity_scorer], + ) + result = await evaluation.evaluate(model) + assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0 + assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2 diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py new file mode 100644 index 00000000000..2c635ea81db --- /dev/null +++ b/tests/scorers/test_string_scorer.py @@ -0,0 +1,33 @@ +import pytest + +from weave.scorers import ( + LevenshteinScorer, + StringMatchScorer, +) + + +@pytest.mark.parametrize( + "output, target, expected_result", + [ + ("Morgan", "Hello my name is Morgan", True), + ("Alice", "Hello my name is Bob", False), + ], +) +def test_string_match_scorer(output, target, expected_result): + scorer = StringMatchScorer() + result = scorer.score(output, target) + assert result["string_in_input"] is expected_result + + +@pytest.mark.parametrize( + "output, target, expected_distance", + [ + ("Hello", "Hallo", 1), + ("Hello", "Hello", 0), + ("Hello", "World", 4), + ], +) +def test_levenshtein_scorer(output, target, expected_distance): + scorer = LevenshteinScorer() + result = scorer.score(output, target) + assert result["levenshtein_distance"] == expected_distance diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py new file mode 100644 index 00000000000..ca6c3f7139b --- /dev/null +++ b/tests/scorers/test_summarization_scorer.py @@ -0,0 +1,110 @@ +import pytest +from openai import OpenAI + +import weave +from weave.scorers import ( + SummarizationScorer, +) +from weave.scorers.summarization_scorer import ( + EntityExtractionResponse, + SummarizationEvaluationResponse, +) + + +@pytest.fixture +def mock_create(monkeypatch): + def _mock_create(*args, **kwargs): + response_model = kwargs.get("response_model") + if response_model == EntityExtractionResponse: + return EntityExtractionResponse(entities=["entity1", "entity2"]) + elif response_model == SummarizationEvaluationResponse: + return SummarizationEvaluationResponse( + think_step_by_step="This is some reasoning.", + summarization_evaluation="excellent", + ) + + # Patch the 'create' function wherever it is called + monkeypatch.setattr("weave.scorers.summarization_scorer.create", _mock_create) + + +@pytest.fixture +def summarization_scorer(mock_create): + return SummarizationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + + +def test_summarization_scorer_evaluate_summary(summarization_scorer, mock_create): + input_text = "This is the original text." + summary_text = "This is the summary." + result = summarization_scorer.evaluate_summary( + input=input_text, summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) + assert result.summarization_evaluation == "excellent" + assert result.think_step_by_step == "This is some reasoning." + + +@pytest.mark.asyncio +async def test_summarization_scorer_score(summarization_scorer): + input_text = "This is the original text." + output_text = "This is the summary." 
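+    # Both the entity-extraction and summary-evaluation LLM calls are patched by the mock_create fixture above.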
+ result = await summarization_scorer.score(input=input_text, output=output_text) + assert isinstance(result, dict) + assert "summarization_eval_score" in result + assert result["summarization_eval_score"] == 1.0 # "excellent" maps to 1.0 + assert "llm_eval_reasoning" in result + assert result["llm_eval_reasoning"] == "This is some reasoning." + assert "is_entity_dense" in result + assert isinstance(result["is_entity_dense"], bool) + assert "entity_density" in result + assert isinstance(result["entity_density"], float) + + +def test_summarization_scorer_initialization(summarization_scorer): + assert isinstance(summarization_scorer, SummarizationScorer) + assert summarization_scorer.model_id == "gpt-4o" + assert summarization_scorer.temperature == 0.7 + assert summarization_scorer.max_tokens == 1024 + + +def test_summarization_scorer_extract_entities(summarization_scorer): + text = "This is a sample text with entities." + entities = summarization_scorer.extract_entities(text) + assert isinstance(entities, list) + assert len(entities) == 2 + assert "entity1" in entities + assert "entity2" in entities + + +@pytest.mark.asyncio +async def test_evaluate_summary_scorer(summarization_scorer): + dataset = [ + { + "input": "This is the original text.", + }, + { + "input": "This is another original text.", + }, + ] + evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) + + @weave.op + def model(input: str): + return "This is the summary." + + result = await evaluation.evaluate(model) + assert isinstance(result, dict) + assert "SummarizationScorer" in result + assert "entity_density" in result["SummarizationScorer"] + assert "is_entity_dense" in result["SummarizationScorer"] + assert "summarization_eval_score" in result["SummarizationScorer"] + assert "model_latency" in result + + assert result["SummarizationScorer"]["entity_density"]["mean"] == pytest.approx(0.5) + assert result["SummarizationScorer"]["is_entity_dense"]["true_count"] == 2 + assert result["SummarizationScorer"]["is_entity_dense"]["true_fraction"] == 1.0 + assert result["SummarizationScorer"]["summarization_eval_score"]["mean"] == 1.0 diff --git a/tests/scorers/test_utils.py b/tests/scorers/test_utils.py new file mode 100644 index 00000000000..03d95aff6c9 --- /dev/null +++ b/tests/scorers/test_utils.py @@ -0,0 +1,8 @@ +from weave.scorers.utils import stringify + + +def test_stringify(): + assert stringify("Hello, world!") == "Hello, world!" 
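+    # Non-string inputs are rendered as JSON text with two-space indentation, as the asserts below show.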
+ assert stringify(123) == "123" + assert stringify([1, 2, 3]) == "[\n 1,\n 2,\n 3\n]" + assert stringify({"a": 1, "b": 2}) == '{\n "a": 1,\n "b": 2\n}' diff --git a/tests/trace/test_client_trace.py b/tests/trace/test_client_trace.py index 857d9b50042..2f444e30198 100644 --- a/tests/trace/test_client_trace.py +++ b/tests/trace/test_client_trace.py @@ -1443,7 +1443,7 @@ def test_named_reuse(client): dataset = weave.ref(d_ref.uri()).get() @weave.op() - async def dummy_score(model_output): + async def dummy_score(output): return 1 class SimpleModel(weave.Model): diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py index f5ada25215f..76cc9f5b739 100644 --- a/tests/trace/test_evaluate.py +++ b/tests/trace/test_evaluate.py @@ -4,14 +4,14 @@ import weave from weave import Dataset, Evaluation, Model -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] dataset = Dataset(rows=dataset_rows) expected_eval_result = { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "score": {"true_count": 1, "true_fraction": 0.5}, "model_latency": {"mean": pytest.approx(0, abs=1)}, } @@ -24,8 +24,8 @@ async def predict(self, input) -> str: @weave.op() -def score(target, model_output): - return target == model_output +def score(target, output): + return target == output @weave.op() @@ -57,7 +57,7 @@ async def model_predict(input, target) -> str: ) result = asyncio.run(evaluation.evaluate(model_predict)) assert result == { - "model_output": {"mean": 18.5}, + "output": {"mean": 18.5}, "score": {"true_count": 0, "true_fraction": 0.0}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -111,8 +111,8 @@ async def infer(self, input) -> str: def test_score_as_class(client): class MyScorer(weave.Scorer): @weave.op() - def score(self, target, model_output): - return target == model_output + def score(self, target, output): + return target == output evaluation = Evaluation( dataset=dataset_rows, @@ -121,7 +121,7 @@ def score(self, target, model_output): model = EvalModel() result = asyncio.run(evaluation.evaluate(model)) assert result == { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "MyScorer": {"true_count": 1, "true_fraction": 0.5}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -137,8 +137,8 @@ def summarize(self, score_rows): return {"awesome": 3} @weave.op() - def score(self, target, model_output): - return target == model_output + def score(self, target, output): + return target == output evaluation = Evaluation( dataset=dataset_rows, @@ -147,7 +147,7 @@ def score(self, target, model_output): model = EvalModel() result = asyncio.run(evaluation.evaluate(model)) assert result == { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "MyScorer": {"awesome": 3}, "model_latency": { "mean": pytest.approx(0, abs=1), @@ -167,7 +167,7 @@ def return_pred(pred): result = asyncio.run(evaluation.evaluate(return_pred)) assert result == { - "model_output": { + "output": { "a": {"true_count": 1, "true_fraction": 1.0}, "b": {"true_count": 0, "true_fraction": 0.0}, }, diff --git a/tests/trace/test_evaluation_performance.py b/tests/trace/test_evaluation_performance.py index 51aceb0c1e8..8ccd8f9639b 100644 --- a/tests/trace/test_evaluation_performance.py +++ b/tests/trace/test_evaluation_performance.py @@ -91,8 +91,8 @@ def predict(question: str): return "I don't know" @weave.op() - def score(question: str, expected: str, 
model_output: str): - return model_output == expected + def score(question: str, expected: str, output: str): + return output == expected evaluation = weave.Evaluation( name="My Evaluation", diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 7585980ce78..d137a92d4ef 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -9,6 +9,7 @@ import weave from tests.trace.util import AnyIntMatcher from weave import Evaluation, Model +from weave.scorers import Scorer from weave.trace.feedback_types.score import SCORE_TYPE_NAME from weave.trace.weave_client import get_ref from weave.trace_server import trace_server_interface as tsi @@ -45,7 +46,6 @@ class MyModel(Model): @weave.op() def predict(self, question: str): - # Here's where you would add your LLM call and return the output return {"generated_text": "Hello, " + question + self.prompt} @@ -58,12 +58,12 @@ async def do_quickstart(): ] @weave.op() - def match_score1(expected: str, model_output: dict) -> dict: - return {"match": expected == model_output["generated_text"]} + def match_score1(expected: str, output: dict) -> dict: + return {"match": expected == output["generated_text"]} @weave.op() - def match_score2(expected: dict, model_output: dict) -> dict: - return {"match": expected == model_output["generated_text"]} + def match_score2(expected: dict, output: dict) -> dict: + return {"match": expected == output["generated_text"]} model = MyModel(prompt="World") evaluation = Evaluation(dataset=examples, scorers=[match_score1, match_score2]) @@ -192,32 +192,32 @@ def predict(self, question: str): return {"response": res["response"], "confidence": 1 / (len(res) + 1)} -def score_int(expected: str, model_output: dict) -> int: +def score_int(expected: str, output: dict) -> int: matches = 0 - for i in range(min(len(expected), len(model_output["response"]))): - if expected[i] == model_output["response"][i]: + for i in range(min(len(expected), len(output["response"]))): + if expected[i] == output["response"][i]: matches += 1 return matches -def score_float(expected: str, model_output: dict) -> float: - matches = score_int(expected, model_output) - return matches / max(len(expected), len(model_output["response"])) +def score_float(expected: str, output: dict) -> float: + matches = score_int(expected, output) + return matches / max(len(expected), len(output["response"])) -def score_bool(expected: str, model_output: dict) -> bool: - return score_float(expected, model_output) == 1.0 +def score_bool(expected: str, output: dict) -> bool: + return score_float(expected, output) == 1.0 -def score_dict(expected: str, model_output: dict) -> dict: +def score_dict(expected: str, output: dict) -> dict: return { - "d_int": score_int(expected, model_output), - "d_float": score_float(expected, model_output), - "d_bool": score_bool(expected, model_output), + "d_int": score_int(expected, output), + "d_float": score_float(expected, output), + "d_bool": score_bool(expected, output), "d_nested": { - "d_int": score_int(expected, model_output), - "d_float": score_float(expected, model_output), - "d_bool": score_bool(expected, model_output), + "d_int": score_int(expected, output), + "d_float": score_float(expected, output), + "d_bool": score_bool(expected, output), }, "reason": "This is a test reason", } @@ -225,32 +225,32 @@ def score_dict(expected: str, model_output: dict) -> dict: class MyIntScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> int: - return 
score_int(expected, model_output) + def score(self, expected: str, output: dict) -> int: + return score_int(expected, output) class MyFloatScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> float: - return score_float(expected, model_output) + def score(self, expected: str, output: dict) -> float: + return score_float(expected, output) class MyBoolScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> bool: - return score_bool(expected, model_output) + def score(self, expected: str, output: dict) -> bool: + return score_bool(expected, output) class MyDictScorer(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) class MyDictScorerWithCustomFloatSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -260,8 +260,8 @@ def summarize(self, score_rows: list) -> Optional[dict]: class MyDictScorerWithCustomBoolSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -271,8 +271,8 @@ def summarize(self, score_rows: list) -> Optional[dict]: class MyDictScorerWithCustomDictSummary(weave.Scorer): @weave.op() - def score(self, expected: str, model_output: dict) -> dict: - return score_dict(expected, model_output) + def score(self, expected: str, output: dict) -> dict: + return score_dict(expected, output) @weave.op() def summarize(self, score_rows: list) -> Optional[dict]: @@ -393,7 +393,7 @@ async def test_evaluation_data_topology(client): # Prediction Section confidence = 1 / 4 - model_output = { + output = { "response": "A", "confidence": confidence, } @@ -432,7 +432,7 @@ async def test_evaluation_data_topology(client): } # Prediction - assert predict_call.output == model_output + assert predict_call.output == output assert with_empty_feedback(predict_call.summary) == with_empty_feedback( predict_usage ) @@ -457,7 +457,7 @@ async def test_evaluation_data_topology(client): # Predict And Score Group assert predict_and_score_call.output == { - "model_output": model_output, + "output": output, "scores": { "score_int": score_int_score, "score_float": score_float_score, @@ -471,7 +471,7 @@ async def test_evaluation_data_topology(client): } # Summary section - model_output_summary = { + output_summary = { "confidence": {"mean": confidence}, } score_int_auto_summary = {"mean": 1.5} @@ -544,7 +544,7 @@ async def test_evaluation_data_topology(client): "MyDictScorerWithCustomBoolSummary": dict_scorer_bool_summary, "MyDictScorerWithCustomDictSummary": dict_scorer_dict_summary, "model_latency": model_latency, - "model_output": model_output_summary, + "output": output_summary, } ) assert evaluate_call.summary == with_empty_feedback(predict_usage_summary) @@ -566,13 +566,13 @@ async def test_evaluation_data_topology(client): def make_test_eval(): - def function_score(target: dict, model_output: dict) -> dict: - return {"correct": target == model_output} + def 
function_score(expected: str, output: dict) -> dict: + return {"correct": expected == output["generated_text"]} evaluation = weave.Evaluation( name="fruit_eval", dataset=[ - {"id": "0", "sentence": "a", "target": "b"}, + {"id": "0", "sentence": "a", "expected": "b"}, ], scorers=[function_score], ) @@ -665,7 +665,7 @@ async def test_eval_is_robust_to_missing_values(client): def model_func(model_res) -> dict: return resp[model_res] - def function_score(scorer_res, model_output) -> dict: + def function_score(scorer_res, output) -> dict: return resp[scorer_res] evaluation = weave.Evaluation( @@ -676,7 +676,7 @@ def function_score(scorer_res, model_output) -> dict: res = await evaluation.evaluate(model_func) assert res == { - "model_output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, + "output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, "function_score": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}}, "model_latency": {"mean": pytest.approx(0, abs=1)}, } @@ -715,7 +715,7 @@ def model_func( return text - def function_score(image, dc, model, obj, text, model_output) -> bool: + def function_score(image, dc, model, obj, text, output) -> bool: assert isinstance(image, Image.Image) # Note: when we start recursively saving dataset rows, this will @@ -728,7 +728,7 @@ def function_score(image, dc, model, obj, text, model_output) -> bool: assert isinstance(model, MyModel) assert isinstance(obj, MyObj) assert isinstance(text, str) - assert isinstance(model_output, str) + assert isinstance(output, str) return True @@ -780,6 +780,161 @@ def function_score(image, dc, model, obj, text, model_output) -> bool: @pytest.mark.asyncio +async def test_evaluation_with_column_map(): + # Define a dummy scorer that uses column_map + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2' + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + + dataset = [ + {"col1": "Hello", "col2": "World", "target": "HelloWorld"}, + {"col1": "Hi", "col2": "There", "target": "HiThere"}, + {"col1": "Good", "col2": "Morning", "target": "GoodMorning"}, + {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"}, + ] + + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + + # Run the evaluation + eval_out = await evaluation.evaluate(model_function) + + # Check that 'DummyScorer' is in the results + assert "DummyScorer" in eval_out + + # The expected summary should show that 3 out of 4 predictions matched + expected_results = {"true_count": 3, "true_fraction": 0.75} + assert ( + eval_out["DummyScorer"]["match"] == expected_results + ), "The summary should reflect the correct number of matches" + + +@pytest.mark.asyncio +async def test_evaluation_with_wrong_column_map(): + # Define a dummy scorer that uses column_map + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + + dataset = [ + {"col1": "Hello", "col2": "World", "target": "HelloWorld"}, # True + 
{"col1": "Hi", "col2": "There", "target": "HiThere"}, # True + {"col1": "Good", "col2": "Morning", "target": "GoodMorning"}, # True + {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"}, # False + ] + + # Test that the column map is correctly used + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + eval_out = await evaluation.evaluate(model_function) + assert "DummyScorer" in eval_out + assert eval_out["DummyScorer"]["match"] == {"true_count": 3, "true_fraction": 0.75} + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col3' + # this is wrong because col3 does not exist + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col3"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "which is not in the scorer's argument names" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with column_map missing a column + dummy_scorer = DummyScorer(column_map={"foo": "col1"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "is not found in the dataset columns" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + # Create the scorer with wrong argument name + dummy_scorer = DummyScorer(column_map={"jeez": "col1"}) + evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer]) + await evaluation.predict_and_score(model_function, dataset[0]) + assert "is not found in the dataset columns and is not mapped" in str( + excinfo.value + ) + + +# Define another dummy scorer +@pytest.mark.asyncio +async def test_evaluation_with_multiple_column_maps(): + class DummyScorer(Scorer): + @weave.op() + def score(self, foo: str, bar: str, output: str, target: str) -> dict: + # Return whether foo + bar equals output + return {"match": (foo + bar) == output == target} + + class AnotherDummyScorer(Scorer): + @weave.op() + def score(self, input1: str, input2: str, output: str) -> dict: + # Return whether input1 == output reversed + return {"match": input1 == output[::-1]} + + # First scorer maps 'foo'->'col1', 'bar'->'col2' + dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) + + # Second scorer maps 'input1'->'col2', 'input2'->'col1' + another_dummy_scorer = AnotherDummyScorer( + column_map={"input1": "col2", "input2": "col1"} + ) + + @weave.op() + def model_function(col1, col2): + # For testing, return the concatenation of col1 and col2 + return col1 + col2 + + dataset = [ + {"col1": "abc", "col2": "def", "target": "abcdef"}, + {"col1": "123", "col2": "456", "target": "1111"}, + {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"}, + ] + + evaluation = Evaluation( + dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer] + ) + + # Run the evaluation + eval_out = await evaluation.evaluate(model_function) + + # Check that both scorers are in the results + assert "DummyScorer" in eval_out + assert "AnotherDummyScorer" in eval_out + + # Assertions for the first scorer + expected_results_dummy = {"true_count": 1, "true_fraction": 1.0 / 3} + assert ( + eval_out["DummyScorer"]["match"] == expected_results_dummy + ), "All concatenations should match the target" + + # Assertions for the second scorer + # Since input1 == col2, and output is col1 + col2, we check if col2 == (col1 + col2)[::-1] + # 
Evaluate manually: + # First row: col2 = "def", output = "abcdef", output[::-1] = "fedcba" -> "def" != "fedcba" + # Second row: col2 = "456", output = "123456", output[::-1] = "654321" -> "456" != "654321" + # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" == "xyzzyx" is False + # So all matches are False + expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0} + assert ( + eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy + ), "No matches should be found for AnotherDummyScorer" + + async def test_feedback_is_correctly_linked(client): @weave.op def predict(text: str) -> str: diff --git a/tests/trace/test_weave_client.py b/tests/trace/test_weave_client.py index 95866e3ea5c..6f0af63d103 100644 --- a/tests/trace/test_weave_client.py +++ b/tests/trace/test_weave_client.py @@ -754,8 +754,8 @@ async def model_predict(input) -> str: dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, model_output): - return target == model_output + async def score(target, output): + return target == output evaluation = Evaluation( name="my-eval", @@ -764,7 +764,7 @@ async def score(target, model_output): ) result = asyncio.run(evaluation.evaluate(model_predict)) expected_eval_result = { - "model_output": {"mean": 9.5}, + "output": {"mean": 9.5}, "score": {"true_count": 1, "true_fraction": 0.5}, } assert result == expected_eval_result @@ -864,8 +864,8 @@ def test_nested_ref_is_inner(client): dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}] @weave.op() - async def score(target, model_output): - return target == model_output + async def score(target, output): + return target == output evaluation = Evaluation( name="my-eval", diff --git a/weave-js/src/common/components/Markdown.tsx b/weave-js/src/common/components/Markdown.tsx index 70c647642e2..733b4367ae2 100644 --- a/weave-js/src/common/components/Markdown.tsx +++ b/weave-js/src/common/components/Markdown.tsx @@ -105,10 +105,15 @@ const Markdown: React.FC = ({ updateHeight(); }, [html, updateHeight]); + // The `tw-eject` class is used to optionally eject from `.tw-style` resets if this component happens to be rendered with a `.tw-style` parent in the tree + // see: src/wandbTailwindPreflight.css return ( -
+
3: + if len(dataset_column_names) > 10: dataset_column_names_str += ", ..." required_arg_names = [ param.name for param in score_signature.parameters.values() if param.default == inspect.Parameter.empty ] - required_arg_names.remove("model_output") + required_arg_names.remove("output") message = textwrap.dedent( f""" Call error: {e} + If using the `Scorer` weave class, you can set the `scorer.column_map` + attribute to map scorer argument names to dataset columns. + + For example, if the `score` expects "output", "input" and "ground_truth" and we have a dataset + with columns "question" and "answer", `column_map` can be used to map the non-output parameter like so: + {{"input": "question", "ground_truth": "answer"}} + + scorer argument names: {score_arg_names} + dataset keys: {example.keys()} + scorer.column_map: {getattr(scorer, 'column_map', '{}')} + Options for resolving: - a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str}) - b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names} + a. if using the `Scorer` weave class, you can set the `scorer.column_map` attribute to map scorer argument names to dataset column names or + b. change the argument names the in the scoring function of {scorer_name} to match a subset of dataset column names: ({dataset_column_names_str}) or + c. change dataset column names to match expected {scorer_name} argument names: {required_arg_names} """ ) raise OpCallError(message) scores[scorer_name] = result return { - "model_output": model_output, + "output": model_output, "scores": scores, "model_latency": model_latency, } @@ -341,7 +441,7 @@ async def eval_example(example: dict) -> dict: except Exception as e: print("Predict and score failed") traceback.print_exc() - return {"model_output": None, "scores": {}} + return {"output": None, "scores": {}} return eval_row n_complete = 0 @@ -358,7 +458,7 @@ async def eval_example(example: dict) -> dict: # f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]" # type:ignore # ) if eval_row is None: - eval_row = {"model_output": None, "scores": {}} + eval_row = {"output": None, "scores": {}} else: eval_row["scores"] = eval_row.get("scores", {}) for scorer in self.scorers or []: diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py index e69f3afeb3f..86df3d6a055 100644 --- a/weave/flow/scorer.py +++ b/weave/flow/scorer.py @@ -1,158 +1,12 @@ -from collections import defaultdict -from numbers import Number -from typing import Any, Callable, Optional, Sequence, Tuple, Union - -import numpy as np -from pydantic import BaseModel - -import weave -from weave.flow.obj import Object -from weave.trace.isinstance import weave_isinstance -from weave.trace.op import Op, as_op, is_op - - -class Scorer(Object): - def score(self, target: Any, model_output: Any) -> Any: - raise NotImplementedError - - @weave.op() - def summarize(self, score_rows: list) -> Optional[dict]: - return auto_summarize(score_rows) - - -def stderr(data: Sequence[Union[int, float]]) -> float: - if len(data) > 1: - sample_variance = np.var(data, ddof=1) - return float(np.sqrt(sample_variance / len(data))) - else: - return 0 - - -def auto_summarize(data: list) -> Optional[dict[str, Any]]: - """Automatically summarize a list of (potentially nested) dicts. 
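The reworked `OpCallError` message enumerates three fixes; option (a), setting `column_map` on a class-based scorer, could look like the following sketch (the scorer, argument, and column names mirror the example in the message):

```python
import weave
from weave.scorers import Scorer

class QAScorer(Scorer):
    @weave.op()
    def score(self, input: str, ground_truth: str, output: str) -> dict:
        return {"correct": ground_truth in output}

# The dataset has "question" and "answer" columns, so map the non-output arguments:
qa_scorer = QAScorer(column_map={"input": "question", "ground_truth": "answer"})
```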
- - Computes: - - avg for numeric cols - - count and fraction for boolean cols - - other col types are ignored - - If col is all None, result is None - - Returns: - dict of summary stats, with structure matching input dict structure. - """ - if not data: - return {} - data = [x for x in data if x is not None] - - if not data: - return None - - val = data[0] - - if isinstance(val, bool): - return { - "true_count": (true_count := sum(1 for x in data if x)), - "true_fraction": true_count / len(data), - } - elif isinstance(val, Number): - return {"mean": np.mean(data).item()} - elif isinstance(val, dict): - result = {} - all_keys = set().union(*[x.keys() for x in data if isinstance(x, dict)]) - for k in all_keys: - if ( - summary := auto_summarize( - [x.get(k) for x in data if isinstance(x, dict)] - ) - ) is not None: - if k in summary: - result.update(summary) - else: - result[k] = summary - if not result: - return None - return result - elif isinstance(val, BaseModel): - return auto_summarize([x.model_dump() for x in data]) - return None - - -def get_scorer_attributes( - scorer: Union[Callable, Op, Scorer], -) -> Tuple[str, Callable, Callable]: - if weave_isinstance(scorer, Scorer): - scorer_name = scorer.name - if scorer_name is None: - scorer_name = scorer.__class__.__name__ - try: - score_fn = scorer.score - summarize_fn = scorer.summarize # type: ignore - except AttributeError: - raise ValueError( - f"Scorer {scorer_name} must implement score and summarize methods. Did you forget to wrap with @weave.op()?" - ) - elif callable(scorer): - if is_op(scorer): - scorer = as_op(scorer) - scorer_name = scorer.name - else: - scorer_name = scorer.__name__ - score_fn = scorer - summarize_fn = auto_summarize # type: ignore - else: - raise ValueError(f"Unknown scorer type: {scorer}") - return (scorer_name, score_fn, summarize_fn) # type: ignore - - -def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: - # if any denom is zero, then zero. could use NaN instead... - precision: float = 0 - if tp or fp: - precision = tp / (tp + fp) - recall: float = 0 - if tp or fn: - recall = tp / (tp + fn) - f1: float = 0 - if precision or recall: - f1 = 2 * (precision * recall) / (precision + recall) - return precision, recall, f1 - - -class MultiTaskBinaryClassificationF1(Scorer): - class_names: list[str] - - @weave.op() - def summarize(self, score_rows: list) -> Optional[dict]: - result = {} - cols = transpose(score_rows) - - for class_name in self.class_names: - col = cols[class_name] - tp = sum(r["correct"] and not r["negative"] for r in col) - fp = sum(not r["correct"] and not r["negative"] for r in col) - fn = sum(not r["correct"] and r["negative"] for r in col) - precision, recall, f1 = p_r_f1(tp, fp, fn) - result[class_name] = {"f1": f1, "precision": precision, "recall": recall} - - return result - - @weave.op() - def score(self, target: dict, model_output: Optional[dict]) -> dict: - result = {} - for class_name in self.class_names: - class_label = target.get(class_name) - class_model_output = model_output.get(class_name) if model_output else None - result[class_name] = { - "correct": class_label == class_model_output, - "negative": not class_model_output, - } - return result - - -def transpose(rows: list[dict]) -> dict[str, list]: - cols = defaultdict(list) - for row in rows: - for k, v in row.items(): - cols[k].append(v) - return dict(cols) +# Keeping this file for now to avoid breaking changes. 
+# In future, users should import all scoring functionality from weave.scorers +import warnings + +from weave.scorers import * + +warnings.warn( + "Importing from weave.flow.scorer is deprecated. " + "Please import from weave.scorers in the future.", + DeprecationWarning, + stacklevel=2, +) diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py new file mode 100644 index 00000000000..941f48e7b13 --- /dev/null +++ b/weave/scorers/__init__.py @@ -0,0 +1,55 @@ +from weave.scorers.base_scorer import ( + Scorer, + auto_summarize, + get_scorer_attributes, +) +from weave.scorers.classification_scorer import ( + MultiTaskBinaryClassificationF1, + transpose, +) +from weave.scorers.hallucination_scorer import HallucinationFreeScorer +from weave.scorers.json_scorer import ValidJSONScorer +from weave.scorers.llm_scorer import ( + InstructorLLMScorer, + LLMScorer, +) +from weave.scorers.llm_utils import ( + create, + embed, +) +from weave.scorers.moderation_scorer import OpenAIModerationScorer +from weave.scorers.pydantic_scorer import PydanticScorer +from weave.scorers.ragas_scorer import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) +from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer +from weave.scorers.string_scorer import ( + LevenshteinScorer, + StringMatchScorer, +) +from weave.scorers.summarization_scorer import SummarizationScorer +from weave.scorers.xml_scorer import ValidXMLScorer + +__all__ = [ + "auto_summarize", + "create", + "embed", + "ContextEntityRecallScorer", + "ContextRelevancyScorer", + "EmbeddingSimilarityScorer", + "get_scorer_attributes", + "HallucinationFreeScorer", + "InstructorLLMScorer", + "ValidJSONScorer", + "LevenshteinScorer", + "LLMScorer", + "MultiTaskBinaryClassificationF1", + "OpenAIModerationScorer", + "PydanticScorer", + "Scorer", + "StringMatchScorer", + "SummarizationScorer", + "transpose", + "ValidXMLScorer", +] diff --git a/weave/scorers/base_scorer.py b/weave/scorers/base_scorer.py new file mode 100644 index 00000000000..a0eec1ac09c --- /dev/null +++ b/weave/scorers/base_scorer.py @@ -0,0 +1,109 @@ +from numbers import Number +from typing import Any, Callable, Optional, Sequence, Tuple, Union + +import numpy as np +from pydantic import BaseModel, Field + +import weave +from weave.flow.obj import Object +from weave.trace.isinstance import weave_isinstance +from weave.trace.op import Op, as_op, is_op + + +class Scorer(Object): + column_map: Optional[dict[str, str]] = Field( + default=None, + description="A mapping from column names in the dataset to the names expected by the scorer", + ) + + def score(self, input: Any, target: Any, output: Any) -> Any: + raise NotImplementedError + + @weave.op() + def summarize(self, score_rows: list) -> Optional[dict]: + return auto_summarize(score_rows) + + +def stderr(data: Sequence[Union[int, float]]) -> float: + if len(data) > 1: + sample_variance = np.var(data, ddof=1) + return float(np.sqrt(sample_variance / len(data))) + else: + return 0 + + +def auto_summarize(data: list) -> Optional[dict[str, Any]]: + """Automatically summarize a list of (potentially nested) dicts. + + Computes: + - avg for numeric cols + - count and fraction for boolean cols + - other col types are ignored + + If col is all None, result is None + + Returns: + dict of summary stats, with structure matching input dict structure. 
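With the new `weave/scorers` package in place, the old module becomes a deprecation shim; both import paths below resolve to the same objects, but only the first is future-proof:

```python
# New, preferred import path
from weave.scorers import MultiTaskBinaryClassificationF1, Scorer, auto_summarize

# Old path still works for now, but emits a DeprecationWarning on first import
from weave.flow.scorer import MultiTaskBinaryClassificationF1  # noqa: F811
```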
+ """ + if not data: + return {} + data = [x for x in data if x is not None] + + if not data: + return None + + val = data[0] + + if isinstance(val, bool): + return { + "true_count": (true_count := sum(1 for x in data if x)), + "true_fraction": true_count / len(data), + } + elif isinstance(val, Number): + return {"mean": np.mean(data).item()} + elif isinstance(val, dict): + result = {} + all_keys = set().union(*[x.keys() for x in data if isinstance(x, dict)]) + for k in all_keys: + if ( + summary := auto_summarize( + [x.get(k) for x in data if isinstance(x, dict)] + ) + ) is not None: + if k in summary: + result.update(summary) + else: + result[k] = summary + if not result: + return None + return result + elif isinstance(val, BaseModel): + return auto_summarize([x.model_dump() for x in data]) + return None + + +def get_scorer_attributes( + scorer: Union[Callable, Op, Scorer], +) -> Tuple[str, Callable, Callable]: + if weave_isinstance(scorer, Scorer): + scorer_name = scorer.name + if scorer_name is None: + scorer_name = scorer.__class__.__name__ + try: + score_fn = scorer.score + summarize_fn = scorer.summarize # type: ignore + except AttributeError: + raise ValueError( + f"Scorer {scorer_name} must implement score and summarize methods. Did you forget to wrap with @weave.op()?" + ) + elif callable(scorer): + if is_op(scorer): + scorer = as_op(scorer) + scorer_name = scorer.name + else: + scorer_name = scorer.__name__ + score_fn = scorer + summarize_fn = auto_summarize # type: ignore + else: + raise ValueError(f"Unknown scorer type: {scorer}") + return (scorer_name, score_fn, summarize_fn) # type: ignore diff --git a/weave/scorers/classification_scorer.py b/weave/scorers/classification_scorer.py new file mode 100644 index 00000000000..7c6cb1207c3 --- /dev/null +++ b/weave/scorers/classification_scorer.py @@ -0,0 +1,58 @@ +from collections import defaultdict +from typing import Optional, Tuple + +import weave +from weave.scorers.base_scorer import Scorer + + +def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: + # if any denom is zero, then zero. could use NaN instead... 
+ precision: float = 0 + if tp or fp: + precision = tp / (tp + fp) + recall: float = 0 + if tp or fn: + recall = tp / (tp + fn) + f1: float = 0 + if precision or recall: + f1 = 2 * (precision * recall) / (precision + recall) + return precision, recall, f1 + + +class MultiTaskBinaryClassificationF1(Scorer): + class_names: list[str] + + @weave.op() + def summarize(self, score_rows: list) -> Optional[dict]: + result = {} + cols = transpose(score_rows) + + for class_name in self.class_names: + col = cols[class_name] + tp = sum(r["correct"] and not r["negative"] for r in col) + fp = sum(not r["correct"] and not r["negative"] for r in col) + fn = sum(not r["correct"] and r["negative"] for r in col) + precision, recall, f1 = p_r_f1(tp, fp, fn) + result[class_name] = {"f1": f1, "precision": precision, "recall": recall} + + return result + + @weave.op() + def score(self, target: dict, output: Optional[dict]) -> dict: + result = {} + for class_name in self.class_names: + class_label = target.get(class_name) + class_output = output.get(class_name) if output else None + result[class_name] = { + "correct": class_label == class_output, + "negative": not class_output, + } + return result + + +def transpose(rows: list[dict]) -> dict[str, list]: + cols = defaultdict(list) + for row in rows: + for k, v in row.items(): + cols[k].append(v) + return dict(cols) diff --git a/weave/scorers/hallucination_scorer.py b/weave/scorers/hallucination_scorer.py new file mode 100644 index 00000000000..1aee2012134 --- /dev/null +++ b/weave/scorers/hallucination_scorer.py @@ -0,0 +1,160 @@ +from typing import List + +from pydantic import BaseModel, Field + +import weave +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.scorers.utils import stringify + +DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """ +Given some from a user and an generated by an AI system, \ +determine if the contains any hallucinations. + +A "hallucination" is defined as information in the that is not supported by \ +the or is not factually or logically consistent with the . + +# Steps +1. Carefully read and understand the input data. +2. Examine the model output. +3. Compare the output to the input data, identifying any inconsistencies or additions. +4. Evaluate the logical connection between input and output. +5. Determine if any information in the output is not supported by or conflicts with the input. + +# Guidelines +- Focus on factual accuracy and logical consistency +- Consider both explicit and implicit information in the input data +- Be aware of potential misinterpretations or over-generalizations in the output +- Identify any information in the output that goes beyond the scope of the input + +# Examples +## Data to analyze + + +The cat is black and white. + + + +The cat has orange stripes. + + +## Analysis: +{ + "think_step_by_step": "The cat is black and white. The cat has orange stripes. \ +The output contradicts the input data because the input specifies black and white, \ +while the output mentions orange. 
The output also introduces a pattern not present in \ +the input.", + "reasoning": [ + { + "hallucination_type": "Color comparison", + "observation": "Input specifies black and white, output mentions orange" + }, + { + "hallucination_type": "Pattern analysis", + "observation": "Input doesn't mention any pattern, output introduces stripes" + } + ], + "conclusion": "The output contains two hallucinations: it contradicts the color information \ +and introduces a pattern not present in the input." + "is_hallucination": true, +} + +# Notes +- Ensure each step in the reasoning process is clearly articulated +- Be objective and avoid assumptions not supported by the input data +- If the output contains factual information not present in the input, it may be a \ +hallucination even if it doesn't directly contradict the input +""" + +DEFAULT_HALLUCINATION_USER_PROMPT = """ +Analyze the following and and determine if the contains any hallucinations. +# Data to analyze + + +{input_data} + + + +{output} + +""" + + +class HallucinationReasoning(BaseModel): + hallucination_type: str = Field( + description="A short name for the type of hallucination." + ) + observation: str = Field( + description="An observation from the and that supports the hallucination." + ) + + +class HallucinationResponse(BaseModel): + chain_of_thought: str = Field( + description="Think step by step about whether the contains hallucinations \ +based on the ." + ) + reasonings: List[HallucinationReasoning] = Field( + description="A list of reasoning steps that lead to the conclusion about whether or not\ +the contains hallucinations." + ) + conclusion: str = Field(description="The conclusion of the analysis.") + has_hallucination: bool = Field( + description="Whether the is free of hallucinations based on the . True means it is NOT a hallucination." + ) + + +class HallucinationFreeScorer(InstructorLLMScorer): + """ + A Scorer that uses an LLM to determine if the model output contains any hallucinations + based on the input data. + + Note: + - The meaning of "hallucination" can vary from person to person, you will likely want to + customize the `system_prompt` and `user_prompt` to fit your specific needs. + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "context". It will use + this data as the ground-truth to check hallucinations against. If your dataset column has a + different name, you can specify a different mapping using the `column_map` argument in the init + of HallucinationFreeScorer by passing `column_map={"context": "context"}`. + + Attributes: + system_prompt (str): The prompt describing the task, defines what a "hallucination" is. + user_prompt (str): The string template to pass the input and output data. The template must + contain placeholders for both `{input_data}` and `{output}`. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> HallucinationResponse: + Analyzes the output to detect hallucinations based on the given context. 
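A usage sketch for `HallucinationFreeScorer` (assumes the `openai` and `instructor` packages are installed and `OPENAI_API_KEY` is set; the `source_text` column name is illustrative):

```python
from openai import OpenAI

from weave.scorers import HallucinationFreeScorer

scorer = HallucinationFreeScorer(
    client=OpenAI(),                        # wrapped with instructor by InstructorLLMScorer
    column_map={"context": "source_text"},  # read `context` from the dataset's "source_text" column
)
result = scorer.score(
    output="The cat has orange stripes.",
    context="The cat is black and white.",
)
# `result` is the HallucinationResponse as a dict: chain_of_thought, reasonings,
# conclusion, and has_hallucination
```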
+ """ + + system_prompt: str = DEFAULT_HALLUCINATION_SYSTEM_PROMPT + user_prompt: str = DEFAULT_HALLUCINATION_USER_PROMPT + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 + + @weave.op + def score(self, output: str, context: str) -> HallucinationResponse: + output = stringify(output) + response = create( + self.client, + messages=[ + {"role": "system", "content": self.system_prompt}, + { + "role": "user", + "content": self.user_prompt.format( + input_data=context, output=output + ), + }, + ], + model=self.model_id, + response_model=HallucinationResponse, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + return response.model_dump() # Morgan wants this to be a dict diff --git a/weave/scorers/json_scorer.py b/weave/scorers/json_scorer.py new file mode 100644 index 00000000000..e7604a8f0ae --- /dev/null +++ b/weave/scorers/json_scorer.py @@ -0,0 +1,17 @@ +import json +from typing import Any + +import weave +from weave.scorers.base_scorer import Scorer + + +class ValidJSONScorer(Scorer): + """Validate whether a string is valid JSON.""" + + @weave.op + def score(self, output: Any) -> dict: + try: + _ = json.loads(output) + return {"json_valid": True} + except json.JSONDecodeError: + return {"json_valid": False} diff --git a/weave/scorers/llm_scorer.py b/weave/scorers/llm_scorer.py new file mode 100644 index 00000000000..b3660a3b9cd --- /dev/null +++ b/weave/scorers/llm_scorer.py @@ -0,0 +1,70 @@ +from pydantic import Field, field_validator + +from weave.scorers.base_scorer import Scorer +from weave.scorers.llm_utils import ( + _LLM_CLIENTS, + _LLM_CLIENTS_NAMES, + instructor_client, +) + + +class LLMScorer(Scorer): + """Score model outputs using a Large Language Model (LLM). + + This scorer leverages LLMs to evaluate and score model outputs. It provides a flexible + way to use different LLM providers for scoring purposes. + + Attributes: + client: An instantiated LLM client with valid API credentials + model_id: The specific model identifier to use for scoring + """ + + client: _LLM_CLIENTS = Field( + description="The LLM client to use, has to be instantiated with an api_key" + ) + model_id: str = Field(description="The model to use") + + @field_validator("client") + def validate_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: + raise ValueError( + f"Invalid client type. Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" + ) + return v + + +class InstructorLLMScorer(Scorer): + """Score a model using an LLM with structured outputs. + + This scorer extends the base LLM scoring capability by adding temperature and + token control for more precise scoring behavior. It automatically wraps the + provided client with [instructor](https://github.com/instructor-ai/instructor) + functionality for structured outputs. 
+ + Attributes: + client: An instantiated LLM client with valid API credentials + model_id: The specific model identifier to use for scoring + temperature: Controls randomness in the LLM's responses (0.0 to 1.0) + max_tokens: Maximum number of tokens allowed in the LLM's response + """ + + client: _LLM_CLIENTS = Field( + description="The LLM client to use, has to be instantiated with an api_key" + ) + model_id: str = Field(description="The model to use") + temperature: float = Field( + ..., description="The temperature to use for the response" + ) + max_tokens: int = Field( + ..., description="The maximum number of tokens in the response" + ) + + @field_validator("client") + def validate_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: + raise ValueError( + f"Invalid client type. Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" + ) + return instructor_client(v) diff --git a/weave/scorers/llm_utils.py b/weave/scorers/llm_utils.py new file mode 100644 index 00000000000..4cf70af729a --- /dev/null +++ b/weave/scorers/llm_utils.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Union + +from weave.trace.autopatch import autopatch + +autopatch() # ensure both weave patching and instructor patching are applied + +OPENAI_DEFAULT_MODEL = "gpt-4o" +OPENAI_DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" +OPENAI_DEFAULT_MODERATION_MODEL = "text-moderation-latest" + +ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet" + +MISTRAL_DEFAULT_MODEL = "mistral-large-latest" +MISTRAL_DEFAULT_EMBEDDING_MODEL = "mistral-embed" + +DEFAULT_MAX_TOKENS = 4096 + +if TYPE_CHECKING: + import instructor + from anthropic import Anthropic, AsyncAnthropic + from google.generativeai import GenerativeModel + from instructor.patch import InstructorChatCompletionCreate + from mistralai import Mistral + from openai import AsyncOpenAI, OpenAI + + _LLM_CLIENTS = Union[ + OpenAI, AsyncOpenAI, Anthropic, AsyncAnthropic, Mistral, GenerativeModel + ] +else: + _LLM_CLIENTS = object + +_LLM_CLIENTS_NAMES = ( + "OpenAI", + "AsyncOpenAI", + "Anthropic", + "AsyncAnthropic", + "Mistral", + "GenerativeModel", +) + + +def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": + try: + import instructor + except ImportError: + raise ImportError( + "The `instructor` package is required to use LLM-powered scorers, please run `pip install instructor`" + ) + + client_type = type(client).__name__.lower() + + if "openai" in client_type: + return instructor.from_openai(client) + elif "anthropic" in client_type: + return instructor.from_anthropic(client) + elif "mistral" in client_type: + return instructor.from_mistral(client) + elif "generativemodel" in client_type: + return instructor.from_gemini( + client=client, + mode=instructor.Mode.GEMINI_JSON, + ) + else: + raise ValueError(f"Unsupported client type: {client_type}") + + +def create( + client: instructor.client, *args: Any, **kwargs: Any +) -> InstructorChatCompletionCreate: + # gemini has slightly different argument namings... 
+ # max_tokens -> max_output_tokens + if "generativemodel" in type(client.client).__name__.lower(): + max_output_tokens = kwargs.pop("max_tokens") + temperature = kwargs.pop("temperature", None) + _ = kwargs.pop("model") # model is baked in the client + kwargs["generation_config"] = dict( + max_output_tokens=max_output_tokens, + temperature=temperature, + ) + return client.chat.completions.create(*args, **kwargs) + + +def embed( + client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs: Any +) -> List[List[float]]: + client_type = type(client).__name__.lower() + if "openai" in client_type: + response = client.embeddings.create(model=model_id, input=texts, **kwargs) + return [embedding.embedding for embedding in response.data] + elif "mistral" in client_type: + response = client.embeddings.create(model=model_id, inputs=texts, **kwargs) + return [embedding.embedding for embedding in response.data] + else: + raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}") diff --git a/weave/scorers/moderation_scorer.py b/weave/scorers/moderation_scorer.py new file mode 100644 index 00000000000..aaadeb7952c --- /dev/null +++ b/weave/scorers/moderation_scorer.py @@ -0,0 +1,41 @@ +from typing import Any + +from pydantic import field_validator + +import weave +from weave.scorers.llm_scorer import LLMScorer +from weave.scorers.llm_utils import _LLM_CLIENTS, OPENAI_DEFAULT_MODERATION_MODEL + + +class OpenAIModerationScorer(LLMScorer): + """Use OpenAI moderation API to check if the model output is safe. + + Args: + model_id: The OpenAI model to use for moderation. Defaults to `text-moderation-latest`. + """ + + model_id: str = OPENAI_DEFAULT_MODERATION_MODEL + + @field_validator("client") + def validate_openai_client(cls, v: _LLM_CLIENTS) -> _LLM_CLIENTS: + # Method implementation + try: + from openai import ( # Ensure these are the correct imports + AsyncOpenAI, + OpenAI, + ) + except ImportError: + raise ValueError("Install openai to use this scorer") + + if not isinstance(v, (OpenAI, AsyncOpenAI)): + raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI") + return v + + @weave.op + def score(self, output: Any) -> dict: + response = self.client.moderations.create( + model=self.model_id, + input=output, + ).results[0] + categories = {k: v for k, v in response.categories.items() if v} + return {"flagged": response.flagged, "categories": categories} diff --git a/weave/scorers/pydantic_scorer.py b/weave/scorers/pydantic_scorer.py new file mode 100644 index 00000000000..0a5dcf1e768 --- /dev/null +++ b/weave/scorers/pydantic_scorer.py @@ -0,0 +1,27 @@ +from typing import Any, Type + +from pydantic import BaseModel, ValidationError + +import weave +from weave.scorers.base_scorer import Scorer + + +class PydanticScorer(Scorer): + """Validate the model output against a pydantic model.""" + + model: Type[BaseModel] + + @weave.op + def score(self, output: Any) -> dict: + if isinstance(output, str): + try: + self.model.model_validate_json(output) + return {"valid_pydantic": True} + except ValidationError: + return {"valid_pydantic": False} + else: + try: + self.model.model_validate(output) + return {"valid_pydantic": True} + except ValidationError: + return {"valid_pydantic": False} diff --git a/weave/scorers/ragas_scorer.py b/weave/scorers/ragas_scorer.py new file mode 100644 index 00000000000..a8b754af541 --- /dev/null +++ b/weave/scorers/ragas_scorer.py @@ -0,0 +1,135 @@ +# implementing metrics from ragas: https://github.com/explodinggradients/ragas + 
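`PydanticScorer` checks outputs against a user-supplied Pydantic model; a minimal sketch (the `Person` model is illustrative):

```python
from pydantic import BaseModel

from weave.scorers import PydanticScorer

class Person(BaseModel):
    name: str
    age: int

person_scorer = PydanticScorer(model=Person)
print(person_scorer.score(output='{"name": "Ada", "age": 36}'))  # {'valid_pydantic': True}
print(person_scorer.score(output={"name": "Ada"}))               # {'valid_pydantic': False} (missing "age")
```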
+from textwrap import dedent + +from pydantic import BaseModel, Field + +import weave +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create + + +class EntityExtractionResponse(BaseModel): + entities: list[str] = Field( + description="A list of unique entities extracted from the text" + ) + + +class ContextEntityRecallScorer(InstructorLLMScorer): + """ + A Scorer that estimates context recall by extracting entities from both the model output + and the context, then computing the recall score between them. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects two arguments: 'output' (the model's response) and 'context' + (the reference text). If your dataset columns have different names, use the `column_map` + argument when initializing the scorer. + - Entity extraction is performed using an LLM, so results may vary based on the model used. + + Attributes: + extraction_prompt (str): The prompt template used to extract entities from text. Must + contain a {text} placeholder. + model_id (str): The LLM model name, depends on the LLM provider being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> dict: + Computes the recall score by comparing entities in the output against those in the context. + Returns a dict with a 'recall' key containing the score (0.0 to 1.0). + """ + + extraction_prompt: str = dedent(""" + Extract unique entities from the following text without repetition. + + Text: {text} + Entities: + """) + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 + + def extract_entities(self, text: str) -> list[str]: + # Use LLM to extract entities + prompt = self.extraction_prompt.format(text=text) + response = create( + self.client, + messages=[{"role": "user", "content": prompt}], + response_model=EntityExtractionResponse, + model=self.model_id, + ) + # Assume entities are returned as a comma-separated list + entities = [e.strip() for e in response.entities] + return entities + + @weave.op + def score(self, output: str, context: str) -> dict: + expected_entities = self.extract_entities(output) + context_entities = self.extract_entities(context) + # Calculate recall + if not expected_entities: + return {"recall": 0.0} + matches = set(expected_entities) & set(context_entities) + recall = len(matches) / len(expected_entities) + return {"recall": recall} + + +class RelevancyResponse(BaseModel): + reasoning: str = Field( + description="Think step by step about whether the context is relevant to the question" + ) + relevancy_score: int = Field( + ge=0, + le=1, + description="The relevancy score of the context to the question (0 for not relevant, 1 for relevant)", + ) + + +class ContextRelevancyScorer(InstructorLLMScorer): + """ + A Scorer that evaluates the relevancy of the provided context to the model output using an LLM. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects two arguments: 'output' (treated as the question) and 'context' + (the reference text). 
If your dataset columns have different names, use the `column_map` + argument when initializing the scorer. + - The relevancy score is binary (0 or 1) where 1 indicates relevant context. + + Attributes: + relevancy_prompt (str): The prompt template used to evaluate context relevancy. Must + contain placeholders for both {question} and {context}. + model_id (str): The LLM model name, depends on the LLM provider being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + score(output: str, context: str) -> dict: + Evaluates the relevancy of the context to the output/question. + Returns a dict with 'relevancy_score' (0 or 1) and 'reasoning' keys. + """ + + relevancy_prompt: str = dedent(""" + Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. + + Question: {question} + Context: {context} + Relevancy Score (0-1): + """) + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 4096 + + @weave.op + def score(self, output: str, context: str) -> dict: + prompt = self.relevancy_prompt.format(question=output, context=context) + response = create( + self.client, + messages=[{"role": "user", "content": prompt}], + response_model=RelevancyResponse, + model=self.model_id, + ) + return response.model_dump() diff --git a/weave/scorers/similarity_scorer.py b/weave/scorers/similarity_scorer.py new file mode 100644 index 00000000000..a20e4a13841 --- /dev/null +++ b/weave/scorers/similarity_scorer.py @@ -0,0 +1,46 @@ +from typing import Any + +import numpy as np +from pydantic import Field + +import weave +from weave.scorers.llm_scorer import LLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL, embed + + +class EmbeddingSimilarityScorer(LLMScorer): + """Check the cosine similarity distance between the model output and the target. + + The threshold is the minimum cosine similarity score that is considered similar. + + Args: + threshold: The minimum cosine similarity score that is considered similar. 
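A usage sketch for the two RAGAS-inspired scorers (assumes `openai` and `instructor` are installed; the printed scores are illustrative, since they come from an LLM):

```python
from openai import OpenAI

from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer

client = OpenAI()
recall_scorer = ContextEntityRecallScorer(client=client)
relevancy_scorer = ContextRelevancyScorer(client=client)

context = "Paris is the capital of France."
print(recall_scorer.score(output="The capital of France is Paris.", context=context))
# e.g. {'recall': 1.0}
print(relevancy_scorer.score(output="What is the capital of France?", context=context))
# e.g. {'reasoning': '...', 'relevancy_score': 1}
```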
Defaults to 0.5 + """ + + threshold: float = Field(0.5, description="The threshold for the similarity score") + model_id: str = OPENAI_DEFAULT_EMBEDDING_MODEL + + @weave.op + def score(self, output: str, target: str) -> Any: + assert ( + self.threshold >= -1 and self.threshold <= 1 + ), "`threshold` should be between -1 and 1" + model_embedding, target_embedding = self._compute_embeddings(output, target) + return self.cosine_similarity(model_embedding, target_embedding) + + def _compute_embeddings( + self, output: str, target: str + ) -> tuple[list[float], list[float]]: + embeddings = embed(self.client, self.model_id, [output, target]) + return embeddings[0], embeddings[1] + + def cosine_similarity(self, vec1: list[float], vec2: list[float]) -> dict: + """Compute the cosine similarity between two vectors.""" + arr1 = np.array(vec1) + arr2 = np.array(vec2) + cosine_sim = np.dot(arr1, arr2) / (np.linalg.norm(arr1) * np.linalg.norm(arr2)) + cosine_sim = float(cosine_sim) + return { + "similarity_score": cosine_sim, + "is_similar": cosine_sim >= self.threshold, + } diff --git a/weave/scorers/string_scorer.py b/weave/scorers/string_scorer.py new file mode 100644 index 00000000000..83dec55c762 --- /dev/null +++ b/weave/scorers/string_scorer.py @@ -0,0 +1,38 @@ +from typing import Callable + +from pydantic import Field, model_validator + +import weave +from weave.scorers.base_scorer import Scorer + + +class StringMatchScorer(Scorer): + """Scorer that checks if the model output string is found in the search columns of the dataset row.""" + + @weave.op + def score(self, output: str, target: str) -> dict: + string_in_input = output.lower() in target.lower() + return {"string_in_input": string_in_input} + + +class LevenshteinScorer(Scorer): + distance: Callable[[str, str], int] = Field( + default=None, description="The Levenshtein distance function" + ) + + @model_validator(mode="after") + def check_levenshtein(self) -> "LevenshteinScorer": + try: + from Levenshtein import distance + + self.distance = distance + return self + except ImportError: + raise ValueError( + "Levenshtein package not found. Please install it with `pip install Levenshtein`" + ) + + @weave.op + def score(self, output: str, target: str) -> dict: + distance = self.distance(output, target) + return {"levenshtein_distance": distance} diff --git a/weave/scorers/summarization_scorer.py b/weave/scorers/summarization_scorer.py new file mode 100644 index 00000000000..18e7c7cb64b --- /dev/null +++ b/weave/scorers/summarization_scorer.py @@ -0,0 +1,200 @@ +import asyncio +from typing import List, Literal + +from pydantic import BaseModel, Field + +import weave +from weave.scorers.llm_scorer import InstructorLLMScorer +from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create + +DEFAULT_EXTRACTION_SYSTEM_PROMPT = """ +Given a , extract all the unique entities from the text without repetition. +""" + +DEFAULT_EXTRACTION_USER_PROMPT = """ +Extract all the unique entities from the following without repetition: + +{text} + +""" + +DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT = """ +Given an and a , evaluate the quality of the . + +# Considerations +- Does the contain the key information in the ? +- Is the concise and informative? +- Is the grammatically correct? +- Does the contain information or assertions that are not present in the ? 
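The string and embedding scorers are lightweight; a sketch (needs the `Levenshtein` package for `LevenshteinScorer` and an OpenAI key for the embedding scorer):

```python
from openai import OpenAI

from weave.scorers import (
    EmbeddingSimilarityScorer,
    LevenshteinScorer,
    StringMatchScorer,
)

print(StringMatchScorer().score(output="paris", target="Paris is lovely"))
# {'string_in_input': True}
print(LevenshteinScorer().score(output="kitten", target="sitting"))
# {'levenshtein_distance': 3}

similarity_scorer = EmbeddingSimilarityScorer(client=OpenAI(), threshold=0.7)
print(similarity_scorer.score(output="The weather is nice", target="It is sunny outside"))
# {'similarity_score': <cosine similarity>, 'is_similar': <bool>}
```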
+ +# Scoring Rubric +`excellent`: The contains all of the key information and entities in the , \ +is concise and information dense, is grammatically correct and doesn't contain any \ +information or assertions that are not present in the . + +`ok`: The contains most of the key information and entities in the , \ +is somewhat concise and informative, is mostly grammatically correct and doesn't contain any \ +information or assertions that are not present in the . + +`poor`: The misses most or all of the key information in the , \ +or is very verbose or vague, or is not concise or informative, or has many grammatical errors, \ +or contains information or assertions that are not present in the . +""" + +DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT = """ +Evaluate the quality of the following given the : + + +{input} + + + +{summary} + +""" + + +class EntityExtractionResponse(BaseModel): + entities: List[str] = Field( + description="A list of unique entities extracted from the text." + ) + + +summarization_quality_options = Literal["poor", "ok", "excellent"] +summarization_quality_mapping = {"poor": 0.0, "ok": 0.5, "excellent": 1.0} + + +class SummarizationEvaluationResponse(BaseModel): + think_step_by_step: str = Field( + description="Think step-by-step about the quality of the before deciding \ +on the summarization_score." + ) + summarization_evaluation: summarization_quality_options = Field( + description="The evaluation of the summary" + ) + + +class SummarizationScorer(InstructorLLMScorer): + """ + A Scorer that evaluates the quality of summaries in two ways: + - using an LLM to calculate the entity density of the summary, similar to how entity density is + used in the Chain of Density paper, https://arxiv.org/abs/2309.04269. This is a rough measure for + how information-dense the summary is. + - using another LLM evaluator to grade the summary quality from `poor`, `ok`, to `excellent`. These + grades are then mapped to numerical scores, {`poor`: 0.0, `ok`: 0.5, `excellent`: 1.0}, in order to + be able to calculate an average score across a dataset of summaries if needed. + + To customise the LLM evaluator you can customise the `summarization_evaluation_system_prompt`and + `summarization_evaluation_prompt` attributes to be tailored your specific definition of what a good summary + should look like. + + Note: + - This Scorer uses the `InstructorLLMScorer` class to generate structured outputs from the LLM + provider's response; you will have to install the `instructor` python package to use it. + - The `score` method expects the input column from the dataset to be named "input". If your dataset + column has a different name, you can specify a different mapping using the `column_map` argument in the + init of SummarizationScorer by passing `column_map={"input": "news_article"}`. + + Attributes: + extraction_system_prompt (str): System prompt to extract the distinct entities in the input. Customising + this can help ensure that the LLM identifies the `entities` that you care about. + extraction_prompt (str): Prompt template for entity extraction; must contain a `{text}` placeholder. + summarization_evaluation_system_prompt (str): System prompt defining how to evaluate the quality of a summary. + Asks an LLM to grade the summary from `poor`, `ok`, to `excellent` and provide a rationale for the grade. + summarization_evaluation_prompt (str): Prompt template for summarization evaluation instruction; must contain + `{input}` and `{summary}` placeholders. 
+ entity_density_threshold (float): Threshold for determining if a summary is sufficiently entity-dense. + model_id (str): The LLM model name, depends on the LLM's providers to be used `client` being used. + temperature (float): LLM temperature setting. + max_tokens (int): Maximum number of tokens in the LLM's response. + + Methods: + extract_entities(text: str) -> List[str]: + Uses an LLM to extract unique entities from the text. + + evaluate_summary(input: str, summary: str) -> SummarizationEvaluationResponse: + Evaluates the quality of a summary using an LLM. + + score(input: str, output: str) -> dict: + Calculates summarization score and entity density score for the given input and output. + """ + + extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT + extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT + summarization_evaluation_system_prompt: str = ( + DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT + ) + summarization_evaluation_prompt: str = DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT + entity_density_threshold: float = 0.08 + model_id: str = OPENAI_DEFAULT_MODEL + temperature: float = 0.7 + max_tokens: int = 1024 + + @weave.op + def extract_entities(self, text: str) -> List[str]: + """Use an LLM to extract entities""" + response = create( + self.client, + messages=[ + {"role": "system", "content": self.extraction_system_prompt}, + {"role": "user", "content": self.extraction_prompt.format(text=text)}, + ], + response_model=EntityExtractionResponse, + model=self.model_id, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + entities = [e.strip().lower() for e in response.entities] + return entities + + @weave.op + def evaluate_summary( + self, input: str, summary: str + ) -> SummarizationEvaluationResponse: + """Evaluate the quality of a summary using an LLM""" + return create( + self.client, + messages=[ + { + "role": "system", + "content": self.summarization_evaluation_system_prompt, + }, + { + "role": "user", + "content": self.summarization_evaluation_prompt.format( + input=input, summary=summary + ), + }, + ], + response_model=SummarizationEvaluationResponse, + model=self.model_id, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + + def simple_word_tokenize(self, text: str) -> List[str]: + """Simple word tokenization""" + return text.split() + + @weave.op + async def score(self, input: str, output: str) -> dict: + extract_task = asyncio.to_thread(self.extract_entities, text=str(output)) + evaluate_task = asyncio.to_thread( + self.evaluate_summary, input=str(input), summary=str(output) + ) + summary_entities, llm_eval = await asyncio.gather(extract_task, evaluate_task) + + # LLM evaluation + result = {} + result["summarization_eval_score"] = summarization_quality_mapping.get( + llm_eval.summarization_evaluation.lower() + ) + result["llm_eval_reasoning"] = llm_eval.think_step_by_step + + # Entity density evaluation + summary_words = self.simple_word_tokenize(output) + entity_density = len(summary_entities) / len(summary_words) + result["is_entity_dense"] = entity_density >= self.entity_density_threshold + result["entity_density"] = entity_density + + return result diff --git a/weave/scorers/utils.py b/weave/scorers/utils.py new file mode 100644 index 00000000000..4080f304fb5 --- /dev/null +++ b/weave/scorers/utils.py @@ -0,0 +1,25 @@ +import json +from typing import Any + +from pydantic import BaseModel + + +def stringify(output: Any) -> str: + """ + Convert any output to a string. 
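A usage sketch for `SummarizationScorer` inside an evaluation (assumes an OpenAI key and `instructor`; the model and the `news_article` column name are illustrative):

```python
import weave
from openai import OpenAI

from weave.scorers import SummarizationScorer

summarization_scorer = SummarizationScorer(
    client=OpenAI(),
    column_map={"input": "news_article"},  # the scorer's `input` argument reads the "news_article" column
)

evaluation = weave.Evaluation(
    dataset=[{"news_article": "The quick brown fox jumps over the lazy dog."}],
    scorers=[summarization_scorer],
)
# asyncio.run(evaluation.evaluate(summarization_model))
# where `summarization_model` is a hypothetical @weave.op that returns a summary string
```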
If the output is a Pydantic BaseModel, + convert it to a JSON string using the model's dump_json method. + """ + if isinstance(output, str): + return output + elif isinstance(output, int): + return str(output) + elif isinstance(output, float): + return str(output) + elif isinstance(output, (list, tuple)): + return json.dumps(output, indent=2) + elif isinstance(output, dict): + return json.dumps(output, indent=2) + elif isinstance(output, BaseModel): + return output.model_dump_json(indent=2) + else: + raise ValueError(f"Unsupported model output type: {type(output)}") diff --git a/weave/scorers/xml_scorer.py b/weave/scorers/xml_scorer.py new file mode 100644 index 00000000000..8545a96686b --- /dev/null +++ b/weave/scorers/xml_scorer.py @@ -0,0 +1,22 @@ +import xml.etree.ElementTree as ET +from typing import Union + +import weave +from weave.scorers.base_scorer import Scorer + + +class ValidXMLScorer(Scorer): + """Score an XML string.""" + + @weave.op + def score(self, output: Union[str, dict]) -> dict: + if isinstance(output, dict): + xml_string = output.get("output", "") + else: + xml_string = output + + try: + ET.fromstring(xml_string) + return {"xml_valid": True} + except ET.ParseError: + return {"xml_valid": False}
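Finally, `ValidXMLScorer` mirrors `ValidJSONScorer`, and `stringify` is the helper several scorers use to coerce arbitrary outputs to text; a quick sketch:

```python
from pydantic import BaseModel

from weave.scorers import ValidXMLScorer
from weave.scorers.utils import stringify

xml_scorer = ValidXMLScorer()
print(xml_scorer.score(output="<root><child>ok</child></root>"))  # {'xml_valid': True}
print(xml_scorer.score(output="<root><unclosed>"))                # {'xml_valid': False}

class Point(BaseModel):
    x: int
    y: int

print(stringify(Point(x=1, y=2)))  # JSON string via model_dump_json(indent=2)
```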