From 1a00368a280f8e87429c1456691b4e6d61999fc4 Mon Sep 17 00:00:00 2001
From: J2-D2-3PO <188380414+J2-D2-3PO@users.noreply.github.com>
Date: Fri, 22 Nov 2024 17:11:07 -0700
Subject: [PATCH] Refactor Evals and Scoring section
---
docs/docs/guides/core-types/evaluations.md | 41 +-
docs/docs/guides/evaluation/custom-scorers.md | 117 ++++
.../guides/evaluation/predefined-scorers.md | 493 ++++++++++++++
docs/docs/guides/evaluation/scorers.md | 607 +-----------------
docs/sidebars.ts | 2 +-
5 files changed, 638 insertions(+), 622 deletions(-)
create mode 100644 docs/docs/guides/evaluation/custom-scorers.md
create mode 100644 docs/docs/guides/evaluation/predefined-scorers.md
diff --git a/docs/docs/guides/core-types/evaluations.md b/docs/docs/guides/core-types/evaluations.md
index 602bdfcebea..1a5eb59e54b 100644
--- a/docs/docs/guides/core-types/evaluations.md
+++ b/docs/docs/guides/core-types/evaluations.md
@@ -1,6 +1,8 @@
# Evaluations
-Evaluation-driven development helps you reliably iterate on an application. The `Evaluation` class is designed to assess the performance of a `Model` on a given `Dataset` or set of examples using scoring functions.
+To systematically improve your LLM application, it's helpful to test your changes against a consistent dataset of potential inputs so that you can catch regressions and inspect your application's behaviour under different conditions. In Weave, the `Evaluation` class is designed to assess the performance of a `Model` on a test dataset.
+
+In a Weave Evaluation, a set of examples is passed through your application, and each output is scored according to multiple scoring functions. The result gives you an overview of your application's performance, along with a rich UI for drilling into individual outputs and scores.
![Evals hero](../../../static/img/evals-hero.png)
@@ -38,23 +40,30 @@ weave.init('intro-example')
asyncio.run(evaluation.evaluate(function_to_evaluate))
```
-## Create an Evaluation
+This page describes how to get started with evaluations.
+## Create an evaluation
+
+To create an evaluation in Weave, follow these steps:
+
+1. [Create an evaluation dataset](#create-an-evaluation-dataset)
+2. [Define scoring functions](#define-scoring-functions)
+3. [Define a Model to evaluate](#define-a-model-to-evaluate)
-To systematically improve your application, it's helpful to test your changes against a consistent dataset of potential inputs so that you catch regressions and can inspect your apps behaviour under different conditions. Using the `Evaluation` class, you can be sure you're comparing apples-to-apples by keeping track of all of the details that you're experimenting and evaluating with.
+### Create an evaluation dataset
-Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.
+First, create a test dataset that will be used to evaluate your application. Generally, the dataset should include failure cases that you want to test for, similar to software unit tests in Test-Driven Development (TDD). You have two options to create a dataset, both sketched in the example below:
-### Define an evaluation dataset
+1. Define a [Dataset](/guides/core-types/datasets).
+2. Define a list of dictionaries with a collection of examples to be evaluated.
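+
+For example (the column names and values below are illustrative, not required by Weave):
+
+```python
+import weave
+
+# Option 1: a Weave Dataset object
+dataset = weave.Dataset(
+    name="example-dataset",
+    rows=[
+        {"question": "What is the capital of France?", "expected": "Paris"},
+        {"question": "Who wrote 'Hamlet'?", "expected": "William Shakespeare"},
+    ],
+)
+
+# Option 2: a plain list of dictionaries
+examples = [
+    {"question": "What is the capital of France?", "expected": "Paris"},
+    {"question": "Who wrote 'Hamlet'?", "expected": "William Shakespeare"},
+]
+```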
-First, define a [Dataset](/guides/core-types/datasets) or list of dictionaries with a collection of examples to be evaluated. These examples are often failure cases that you want to test for, these are similar to unit tests in Test-Driven Development (TDD).
+### Define scoring functions
-### Defining scoring functions
+Next, create a list of _Scorers_. Scorers are functions used to score each example. Scorers must have a `model_output` keyword argument; other arguments are user defined and are taken from the dataset examples. A Scorer only uses the keys it needs, matching dictionary keys in each example to its argument names.
-Then, create a list of scoring functions. These are used to score each example. Each function should have a `model_output` and optionally, other inputs from your examples, and return a dictionary with the scores.
+When defining Scorers, you can either use one of the many predefined scorers available in Weave, or create your own custom Scorer.
-Scoring functions need to have a `model_output` keyword argument, but the other arguments are user defined and are taken from the dataset examples. It will only take the necessary keys by using a dictionary key based on the argument name.
+#### Scorer example
-This will take `expected` from the dictionary for scoring.
+In the following example, the `match_score1()` Scorer takes `expected` from the dictionary for scoring.
```python
import weave
@@ -73,15 +82,17 @@ def match_score1(expected: str, model_output: dict) -> dict:
return {'match': expected == model_output['generated_text']}
```
-### Optional: Define a custom `Scorer` class
+#### Optional: Define a custom `Scorer` class
-In some applications we want to create custom `Scorer` classes - where for example a standardized `LLMJudge` class should be created with specific parameters (e.g. chat model, prompt), specific scoring of each row, and specific calculation of an aggregate score.
+For some applications, you may want to create custom `Scorer` classes. For example, you might create a standardized `LLMJudge` class with specific parameters (e.g. chat model, prompt), custom scoring of each row, and custom calculation of an aggregate score. For more information about creating custom Scorers, see [Create your own Scorers](../evaluation/custom-scorers.md).
-See the tutorial on defining a `Scorer` class in the next chapter on [Model-Based Evaluation of RAG applications](/tutorial-rag#optional-defining-a-scorer-class) for more information.
+> For an end-to-end tutorial that involves defining a custom `Scorer` class, see [Model-Based Evaluation of RAG applications](/tutorial-rag#optional-defining-a-scorer-class).
### Define a Model to evaluate
-To evaluate a `Model`, call `evaluate` on it using an `Evaluation`. `Models` are used when you have attributes that you want to experiment with and capture in weave.
+Once your test dataset and Scorers are defined, you can begin the evaluation. To evaluate a `Model`, call `evaluate` on it using an `Evaluation`. `Models` are used when you have attributes that you want to experiment with and capture in Weave.
+
+The following example runs `predict()` on each example in the `examples` dataset and scores the output with each scoring function defined in the `scorers` list.
```python
from weave import Model, Evaluation
@@ -104,7 +115,7 @@ weave.init('intro-example') # begin tracking results with weave
asyncio.run(evaluation.evaluate(model))
```
-This will run `predict` on each example and score the output with each scoring functions.
+
#### Custom Naming
diff --git a/docs/docs/guides/evaluation/custom-scorers.md b/docs/docs/guides/evaluation/custom-scorers.md
new file mode 100644
index 00000000000..b7b6c27c870
--- /dev/null
+++ b/docs/docs/guides/evaluation/custom-scorers.md
@@ -0,0 +1,117 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Create your own Scorers
+
+In Weave, you can create your own custom Scorers. Scorers can either be function-based or class-based. For a general overview of how Scorers work, see [Scorers in Evaluation Workflows](./scorers.md).
+
+### Function-based Scorers
+
+
+
+ These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like:
+
+ ```python
+ import weave
+
+ @weave.op
+ def evaluate_uppercase(text: str) -> dict:
+ return {"text_is_uppercase": text.isupper()}
+
+ my_eval = weave.Evaluation(
+ dataset=[{"text": "HELLO WORLD"}],
+ scorers=[evaluate_uppercase]
+ )
+ ```
+
+ When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase.
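+
+ As a minimal sketch of how this evaluation could be run end to end (the passthrough `model` op below is a hypothetical stand-in for your application):
+
+ ```python
+ import asyncio
+ import weave
+
+ weave.init("scorer-example")  # start tracking results in Weave
+
+ @weave.op
+ def model(text: str) -> str:
+     # A trivial application that echoes its input.
+     return text
+
+ # my_eval is the Evaluation defined above
+ asyncio.run(my_eval.evaluate(model))
+ ```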
+
+
+
+ These are functions wrapped with `weave.op` that accept an object with `modelOutput` and optionally `datasetRow`. They're great for simple evaluations like:
+ ```typescript
+ import * as weave from 'weave'
+
+ const evaluateUppercase = weave.op(
+ ({modelOutput}) => modelOutput.toUpperCase() === modelOutput,
+ {name: 'textIsUppercase'}
+ );
+
+
+ const myEval = new weave.Evaluation({
+ dataset: [{text: 'HELLO WORLD'}],
+ scorers: [evaluateUppercase],
+ })
+ ```
+
+
+
+
+### Class-based Scorers
+
+
+
+ For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class.
+
+ **Requirements:**
+
+ 1. Inherit from `weave.Scorer`.
+ 2. Define a `score` method decorated with `@weave.op`.
+ 3. The `score` method must return a dictionary.
+
+ Example:
+
+ ```python
+ import weave
+ from openai import OpenAI
+ from weave import Scorer
+
+ llm_client = OpenAI()
+
+ #highlight-next-line
+ class SummarizationScorer(Scorer):
+ model_id: str = "gpt-4o"
+ system_prompt: str = "Evaluate whether the summary is good."
+
+ @weave.op
+ def some_complicated_preprocessing(self, text: str) -> str:
+ processed_text = "Original text: \n" + text + "\n"
+ return processed_text
+
+ @weave.op
+ def call_llm(self, summary: str, processed_text: str) -> dict:
+ res = llm_client.chat.completions.create(
+ messages=[
+ {"role": "system", "content": self.system_prompt},
+ {"role": "user", "content": (
+ f"Analyse how good the summary is compared to the original text."
+ f"Summary: {summary}\n{processed_text}"
+ )}])
+ return {"summary_quality": res}
+
+ @weave.op
+ def score(self, output: str, text: str) -> dict:
+ """Score the summary quality.
+
+ Args:
+ output: The summary generated by an AI system
+ text: The original text being summarized
+ """
+ processed_text = self.some_complicated_preprocessing(text)
+ eval_result = self.call_llm(summary=output, processed_text=processed_text)
+ return {"summary_quality": eval_result}
+
+ summarization_scorer = SummarizationScorer()
+
+ evaluation = weave.Evaluation(
+ dataset=[{"text": "The quick brown fox jumps over the lazy dog."}],
+ scorers=[summarization_scorer])
+ ```
+
+ This class evaluates how good a summary is by comparing it to the original text.
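+
+ To actually run this evaluation you also need something to evaluate. A minimal, hypothetical summarization model (assumed here purely for illustration) could look like this:
+
+ ```python
+ import asyncio
+
+ class MySummarizationModel(weave.Model):
+     @weave.op
+     async def predict(self, text: str) -> str:
+         # Placeholder; a real model would call an LLM to summarize `text`.
+         return "A fox jumps over a dog."
+
+ weave.init("summarization-example")  # start tracking results in Weave
+ asyncio.run(evaluation.evaluate(MySummarizationModel()))
+ ```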
+
+
+
+ ```plaintext
+ This feature is not available in TypeScript yet. Stay tuned!
+ ```
+
+
diff --git a/docs/docs/guides/evaluation/predefined-scorers.md b/docs/docs/guides/evaluation/predefined-scorers.md
new file mode 100644
index 00000000000..004d02c78cd
--- /dev/null
+++ b/docs/docs/guides/evaluation/predefined-scorers.md
@@ -0,0 +1,493 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Predefined Scorers
+
+
+
+ **Installation**
+
+ To use Weave's predefined scorers you need to install some additional dependencies:
+
+ ```bash
+ pip install weave[scorers]
+ ```
+
+ **LLM-evaluators**
+
+ The predefined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI, and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to use them. You can get all the necessary dependencies with `pip install "weave[scorers]"`.
+
+ ### `HallucinationFreeScorer`
+
+ This scorer checks if your AI system's output includes any hallucinations based on the input data.
+
+ ```python
+ from weave.scorers import HallucinationFreeScorer
+
+ llm_client = ... # initialize your LLM client here
+
+ scorer = HallucinationFreeScorer(
+ client=llm_client,
+ model_id="gpt-4o"
+ )
+ ```
+
+ **Customization:**
+
+ - Customize the `system_prompt` and `user_prompt` attributes of the scorer to define what "hallucination" means for you.
+
+ **Notes:**
+
+ - The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column.
+
+ Here is an example in the context of an evaluation:
+
+ ```python
+ import asyncio
+ from openai import OpenAI
+ import weave
+ from weave.scorers import HallucinationFreeScorer
+
+ # Initialize clients and scorers
+ llm_client = OpenAI()
+ hallucination_scorer = HallucinationFreeScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ column_map={"context": "input", "output": "other_col"}
+ )
+
+ # Create dataset
+ dataset = [
+ {"input": "John likes various types of cheese."},
+ {"input": "Pepe likes various types of cheese."},
+ ]
+
+ @weave.op
+ def model(input: str) -> str:
+ return "The person's favorite cheese is cheddar."
+
+ # Run evaluation
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[hallucination_scorer],
+ )
+ result = asyncio.run(evaluation.evaluate(model))
+ print(result)
+ # {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}}
+ ```
+
+ ---
+
+ ### `SummarizationScorer`
+
+ Use an LLM to compare a summary to the original text and evaluate the quality of the summary.
+
+ ```python
+ from weave.scorers import SummarizationScorer
+
+ llm_client = ... # initialize your LLM client here
+
+ scorer = SummarizationScorer(
+ client=llm_client,
+ model_id="gpt-4o"
+ )
+ ```
+
+ **How It Works:**
+
+ This scorer evaluates summaries in two ways:
+
+ 1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the [Chain of Density paper](https://arxiv.org/abs/2309.04269).
+
+ 2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages.
+
+ **Customization:**
+
+ - Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary.
+
+ **Notes:**
+
+ - This scorer uses the `InstructorLLMScorer` class.
+ - The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed.
+
+ Here is an example usage of the `SummarizationScorer` in the context of an evaluation:
+
+ ```python
+ import asyncio
+ from openai import OpenAI
+ import weave
+ from weave.scorers import SummarizationScorer
+
+ class SummarizationModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+ return "This is a summary of the input text."
+
+ # Initialize clients and scorers
+ llm_client = OpenAI()
+ model = SummarizationModel()
+ summarization_scorer = SummarizationScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ )
+ # Create dataset
+ dataset = [
+ {"input": "The quick brown fox jumps over the lazy dog."},
+ {"input": "Artificial Intelligence is revolutionizing various industries."}
+ ]
+
+ # Run evaluation
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
+ results = asyncio.run(evaluation.evaluate(model))
+ print(results)
+ # {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}}
+ ```
+
+ ---
+
+ ### `OpenAIModerationScorer`
+
+ The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material.
+
+ ```python
+ from weave.scorers import OpenAIModerationScorer
+ from openai import OpenAI
+
+ oai_client = OpenAI(api_key=...) # initialize your LLM client here
+
+ scorer = OpenAIModerationScorer(
+ client=oai_client,
+ model_id="text-embedding-3-small"
+ )
+ ```
+
+ **How It Works:**
+
+ - Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved.
+
+ **Notes:**
+
+ - Requires the `openai` Python package.
+ - The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client.
+
+ Here is an example in the context of an evaluation:
+
+ ```python
+ import asyncio
+ from openai import OpenAI
+ import weave
+ from weave.scorers import OpenAIModerationScorer
+
+ class MyModel(weave.Model):
+ @weave.op
+ async def predict(self, input: str) -> str:
+ return input
+
+ # Initialize clients and scorers
+ client = OpenAI()
+ model = MyModel()
+ moderation_scorer = OpenAIModerationScorer(client=client)
+
+ # Create dataset
+ dataset = [
+ {"input": "I love puppies and kittens!"},
+ {"input": "I hate everyone and want to hurt them."}
+ ]
+
+ # Run evaluation
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer])
+ results = asyncio.run(evaluation.evaluate(model))
+ print(results)
+ # {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}}
+ ```
+
+ ---
+
+ ### `EmbeddingSimilarityScorer`
+
+ The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text.
+
+ ```python
+ from weave.scorers import EmbeddingSimilarityScorer
+
+ llm_client = ... # initialize your LLM client here
+
+ similarity_scorer = EmbeddingSimilarityScorer(
+ client=llm_client,
+ target_column="reference_text", # the dataset column to compare the output against
+ threshold=0.4 # the cosine similarity threshold to use
+ )
+ ```
+
+ **Parameters:**
+
+ - `target`: This scorer expects a `target` column in your dataset; it calculates the cosine similarity of the embeddings of the `target` column to the AI system output. If your dataset doesn't contain a column called `target`, you can use the scorer's `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more.
+ - `threshold` (float): The minimum cosine similarity score between the embedding of the AI system output and the embedding of the `target`, above which the two samples are considered "similar" (defaults to `0.5`). `threshold` can be in a range from -1 to 1:
+ - 1 indicates identical direction.
+ - 0 indicates orthogonal vectors.
+ - -1 indicates opposite direction.
+
+ The right cosine similarity threshold can vary quite a lot depending on your use case, so we advise exploring different thresholds.
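+
+ For reference, the underlying cosine similarity between two embedding vectors can be computed as in the following sketch (the scorer does this internally; it is shown here only to clarify what `threshold` is compared against):
+
+ ```python
+ import math
+
+ def cosine_similarity(a: list[float], b: list[float]) -> float:
+     # Dot product of the two vectors divided by the product of their norms.
+     dot = sum(x * y for x, y in zip(a, b))
+     norm_a = math.sqrt(sum(x * x for x in a))
+     norm_b = math.sqrt(sum(x * x for x in b))
+     return dot / (norm_a * norm_b)
+
+ # A pair counts as "similar" when the similarity exceeds the threshold,
+ # which is reflected in the `is_similar` result shown in the example below.
+ ```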
+
+ Here is an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation:
+
+ ```python
+ import asyncio
+ from openai import OpenAI
+ import weave
+ from weave.scorers import EmbeddingSimilarityScorer
+
+ # Initialize clients and scorers
+ client = OpenAI()
+ similarity_scorer = EmbeddingSimilarityScorer(
+ client=client,
+ threshold=0.7,
+ column_map={"target": "reference"}
+ )
+
+ # Create dataset
+ dataset = [
+ {
+ "input": "He's name is John",
+ "reference": "John likes various types of cheese.",
+ },
+ {
+ "input": "He's name is Pepe.",
+ "reference": "Pepe likes various types of cheese.",
+ },
+ ]
+
+ # Define model
+ @weave.op
+ def model(input: str) -> str:
+ return "John likes various types of cheese."
+
+ # Run evaluation
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[similarity_scorer],
+ )
+ result = asyncio.run(evaluation.evaluate(model))
+ print(result)
+ # {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}}
+ ```
+
+ ---
+
+ ### `ValidJSONScorer`
+
+ The `ValidJSONScorer` checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity.
+
+ ```python
+ from weave.scorers import ValidJSONScorer
+
+ json_scorer = ValidJSONScorer()
+ ```
+
+ Here is an example usage of the `ValidJSONScorer` in the context of an evaluation:
+
+ ```python
+ import asyncio
+ import weave
+ from weave.scorers import ValidJSONScorer
+
+ class JSONModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+ # This is a placeholder.
+ # In a real scenario, this would generate JSON.
+ return '{"key": "value"}'
+
+ model = JSONModel()
+ json_scorer = ValidJSONScorer()
+
+ dataset = [
+ {"input": "Generate a JSON object with a key and value"},
+ {"input": "Create an invalid JSON"}
+ ]
+
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer])
+ results = asyncio.run(evaluation.evaluate(model))
+ print(results)
+ # {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}}
+ ```
+
+ ---
+
+ ### `ValidXMLScorer`
+
+ The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs.
+
+ ```python
+ from weave.scorers import ValidXMLScorer
+
+ xml_scorer = ValidXMLScorer()
+ ```
+
+ Here is an example usage of the `ValidXMLScorer` in the context of an evaluation:
+
+ ```python
+ import asyncio
+ import weave
+ from weave.scorers import ValidXMLScorer
+
+ class XMLModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+ # This is a placeholder. In a real scenario, this would generate XML.
+ return '<root>value</root>'
+
+ model = XMLModel()
+ xml_scorer = ValidXMLScorer()
+
+ dataset = [
+ {"input": "Generate a valid XML with a root element"},
+ {"input": "Create an invalid XML"}
+ ]
+
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer])
+ results = asyncio.run(evaluation.evaluate(model))
+ print(results)
+ # {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}}
+ ```
+
+ ---
+
+ ### `PydanticScorer`
+
+ The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure.
+
+ ```python
+ from weave.scorers import PydanticScorer
+ from pydantic import BaseModel
+
+ class FinancialReport(BaseModel):
+ revenue: int
+ year: str
+
+ pydantic_scorer = PydanticScorer(model=FinancialReport)
+ ```
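+
+ As a sketch of how this scorer might be used in an evaluation (the `ReportModel`, its hard-coded JSON output, and the dataset below are illustrative assumptions):
+
+ ```python
+ import asyncio
+ import weave
+ from pydantic import BaseModel
+ from weave.scorers import PydanticScorer
+
+ class FinancialReport(BaseModel):
+     revenue: int
+     year: str
+
+ class ReportModel(weave.Model):
+     @weave.op
+     async def predict(self, input: str) -> str:
+         # Placeholder; a real model would generate JSON matching the schema.
+         return '{"revenue": 100000, "year": "2024"}'
+
+ pydantic_scorer = PydanticScorer(model=FinancialReport)
+
+ dataset = [
+     {"input": "Generate the 2024 financial report as JSON"},
+ ]
+
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[pydantic_scorer])
+ results = asyncio.run(evaluation.evaluate(ReportModel()))
+ print(results)
+ ```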
+
+ ---
+
+ ### RAGAS - `ContextEntityRecallScorer`
+
+ The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. It is based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library.
+
+ ```python
+ from weave.scorers import ContextEntityRecallScorer
+
+ llm_client = ... # initialize your LLM client here
+
+ entity_recall_scorer = ContextEntityRecallScorer(
+ client=llm_client,
+ model_id="your-model-id"
+ )
+ ```
+
+ **How It Works:**
+
+ - Uses an LLM to extract unique entities from the output and context and calculates recall.
+ - **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information.
+ - Returns a dictionary with the recall score.
+
+ **Notes:**
+
+ - Expects a `context` column in your dataset; use `column_map` to map `context` to another dataset column if needed.
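+
+ Conceptually, the recall computation resembles the following sketch (entity extraction is done by an LLM in the actual scorer; plain sets are used here only for illustration):
+
+ ```python
+ def entity_recall(output_entities: set[str], context_entities: set[str]) -> float:
+     # Proportion of entities from the context that also appear in the output.
+     if not context_entities:
+         return 0.0
+     return len(output_entities & context_entities) / len(context_entities)
+
+ # e.g. context mentions {"Paris", "France"}, output mentions {"Paris"} -> recall of 0.5
+ print(entity_recall({"Paris"}, {"Paris", "France"}))
+ ```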
+
+ ---
+
+ ### RAGAS - `ContextRelevancyScorer`
+
+ The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. It is based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library.
+
+ ```python
+ from weave.scorers import ContextRelevancyScorer
+
+ llm_client = ... # initialize your LLM client here
+
+ relevancy_scorer = ContextRelevancyScorer(
+ client=llm_client,
+ model_id="your-model-id"
+ )
+ ```
+
+ **How It Works:**
+
+ - Uses an LLM to rate the relevancy of the context to the output on a scale from 0 to 1.
+ - Returns a dictionary with the `relevancy_score`.
+
+ **Notes:**
+
+ - Expects a `context` column in your dataset; use `column_map` to map `context` to another dataset column if needed.
+ - Customize the `relevancy_prompt` to define how relevancy is assessed.
+
+ Here is an example usage of `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation:
+
+ ```python
+ import asyncio
+ from textwrap import dedent
+ from openai import OpenAI
+ import weave
+ from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer
+
+ class RAGModel(weave.Model):
+ @weave.op()
+ async def predict(self, question: str) -> str:
+ "Retrieve relevant context"
+ return "Paris is the capital of France."
+
+
+ model = RAGModel()
+
+ # Define prompts
+ relevancy_prompt: str = dedent("""
+ Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1.
+
+ Question: {question}
+ Context: {context}
+ Relevancy Score (0-1):
+ """)
+
+ # Initialize clients and scorers
+ llm_client = OpenAI()
+ entity_recall_scorer = ContextEntityRecallScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ )
+
+ relevancy_scorer = ContextRelevancyScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ relevancy_prompt=relevancy_prompt
+ )
+
+ # Create dataset
+ dataset = [
+ {
+ "question": "What is the capital of France?",
+ "context": "Paris is the capital city of France."
+ },
+ {
+ "question": "Who wrote Romeo and Juliet?",
+ "context": "William Shakespeare wrote many famous plays."
+ }
+ ]
+
+ # Run evaluation
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[entity_recall_scorer, relevancy_scorer]
+ )
+ results = asyncio.run(evaluation.evaluate(model))
+ print(results)
+ # {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}}
+ ```
+
+
+
+ ```plaintext
+ This feature is not available in TypeScript yet. Stay tuned!
+ ```
+
+
diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md
index e313dcb5582..324b946fb2a 100644
--- a/docs/docs/guides/evaluation/scorers.md
+++ b/docs/docs/guides/evaluation/scorers.md
@@ -1,9 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-# Evaluation Metrics
-
-## Evaluations in Weave
+# Scorers in Evaluation Workflows
In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. They take the AI's output, analyze it, and return a dictionary of results. Scorers can use your input data as reference if needed and can also output extra information, such as explanations or reasonings from the evaluation.
@@ -22,119 +20,6 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics.
-## Create your own Scorers
-
-### Function-based Scorers
-
-
-
- These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like:
-
- ```python
- import weave
-
- @weave.op
- def evaluate_uppercase(text: str) -> dict:
- return {"text_is_uppercase": text.isupper()}
-
- my_eval = weave.Evaluation(
- dataset=[{"text": "HELLO WORLD"}],
- scorers=[evaluate_uppercase]
- )
- ```
-
- When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase.
-
-
-
- These are functions wrapped with `weave.op` that accept an object with `modelOutput` and optionally `datasetRow`. They're great for simple evaluations like:
- ```typescript
- import * as weave from 'weave'
-
- const evaluateUppercase = weave.op(
- ({modelOutput}) => modelOutput.toUpperCase() === modelOutput,
- {name: 'textIsUppercase'}
- );
-
-
- const myEval = new weave.Evaluation({
- dataset: [{text: 'HELLO WORLD'}],
- scorers: [evaluateUppercase],
- })
- ```
-
-
-
-
-### Class-based Scorers
-
-
-
- For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class.
-
- **Requirements:**
-
- 1. Inherit from `weave.Scorer`.
- 2. Define a `score` method decorated with `@weave.op`.
- 3. The `score` method must return a dictionary.
-
- Example:
-
- ```python
- import weave
- from openai import OpenAI
- from weave import Scorer
-
- llm_client = OpenAI()
-
- #highlight-next-line
- class SummarizationScorer(Scorer):
- model_id: str = "gpt-4o"
- system_prompt: str = "Evaluate whether the summary is good."
-
- @weave.op
- def some_complicated_preprocessing(self, text: str) -> str:
- processed_text = "Original text: \n" + text + "\n"
- return processed_text
-
- @weave.op
- def call_llm(self, summary: str, processed_text: str) -> dict:
- res = llm_client.chat.completions.create(
- messages=[
- {"role": "system", "content": self.system_prompt},
- {"role": "user", "content": (
- f"Analyse how good the summary is compared to the original text."
- f"Summary: {summary}\n{processed_text}"
- )}])
- return {"summary_quality": res}
-
- @weave.op
- def score(self, output: str, text: str) -> dict:
- """Score the summary quality.
-
- Args:
- output: The summary generated by an AI system
- text: The original text being summarized
- """
- processed_text = self.some_complicated_preprocessing(text)
- eval_result = self.call_llm(summary=output, processed_text=processed_text)
- return {"summary_quality": eval_result}
-
- evaluation = weave.Evaluation(
- dataset=[{"text": "The quick brown fox jumps over the lazy dog."}],
- scorers=[summarization_scorer])
- ```
-
- This class evaluates how good a summary is by comparing it to the original text.
-
-
-
- ```plaintext
- This feature is not available in TypeScript yet. Stay tuned!
- ```
-
-
-
## How Scorers Work
### Scorer Keyword Arguments
@@ -303,493 +188,3 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics.
-## Predefined Scorers
-
-
-
- **Installation**
-
- To use Weave's predefined scorers you need to install some additional dependencies:
-
- ```bash
- pip install weave[scorers]
- ```
-
- **LLM-evaluators**
-
- The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. You can get all necessary dependencies with `pip install "weave[scorers]"`
-
- ### `HallucinationFreeScorer`
-
- This scorer checks if your AI system's output includes any hallucinations based on the input data.
-
- ```python
- from weave.scorers import HallucinationFreeScorer
-
- llm_client = ... # initialize your LLM client here
-
- scorer = HallucinationFreeScorer(
- client=llm_client,
- model_id="gpt-4o"
- )
- ```
-
- **Customization:**
-
- - Customize the `system_prompt` and `user_prompt` attributes of the scorer to define what "hallucination" means for you.
-
- **Notes:**
-
- - The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column.
-
- Here you have an example in the context of an evaluation:
-
- ```python
- import asyncio
- from openai import OpenAI
- import weave
- from weave.scorers import HallucinationFreeScorer
-
- # Initialize clients and scorers
- llm_client = OpenAI()
- hallucination_scorer = HallucinationFreeScorer(
- client=llm_client,
- model_id="gpt-4o",
- column_map={"context": "input", "output": "other_col"}
- )
-
- # Create dataset
- dataset = [
- {"input": "John likes various types of cheese."},
- {"input": "Pepe likes various types of cheese."},
- ]
-
- @weave.op
- def model(input: str) -> str:
- return "The person's favorite cheese is cheddar."
-
- # Run evaluation
- evaluation = weave.Evaluation(
- dataset=dataset,
- scorers=[hallucination_scorer],
- )
- result = asyncio.run(evaluation.evaluate(model))
- print(result)
- # {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}}
- ```
-
- ---
-
- ### `SummarizationScorer`
-
- Use an LLM to compare a summary to the original text and evaluate the quality of the summary.
-
- ```python
- from weave.scorers import SummarizationScorer
-
- llm_client = ... # initialize your LLM client here
-
- scorer = SummarizationScorer(
- client=llm_client,
- model_id="gpt-4o"
- )
- ```
-
- **How It Works:**
-
- This scorer evaluates summaries in two ways:
-
- 1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the Chain of Density paper, https://arxiv.org/abs/2309.04269
-
- 2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages.
-
- **Customization:**
-
- - Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary.
-
- **Notes:**
-
- - This scorer uses the `InstructorLLMScorer` class.
- - The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed.
-
- Here you have an example usage of the `SummarizationScorer` in the context of an evaluation:
-
- ```python
- import asyncio
- from openai import OpenAI
- import weave
- from weave.scorers import SummarizationScorer
-
- class SummarizationModel(weave.Model):
- @weave.op()
- async def predict(self, input: str) -> str:
- return "This is a summary of the input text."
-
- # Initialize clients and scorers
- llm_client = OpenAI()
- model = SummarizationModel()
- summarization_scorer = SummarizationScorer(
- client=llm_client,
- model_id="gpt-4o",
- )
- # Create dataset
- dataset = [
- {"input": "The quick brown fox jumps over the lazy dog."},
- {"input": "Artificial Intelligence is revolutionizing various industries."}
- ]
-
- # Run evaluation
- evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
- results = asyncio.run(evaluation.evaluate(model))
- print(results)
- # {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}}
- ```
-
- ---
-
- ### `OpenAIModerationScorer`
-
- The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material.
-
- ```python
- from weave.scorers import OpenAIModerationScorer
- from openai import OpenAI
-
- oai_client = OpenAI(api_key=...) # initialize your LLM client here
-
- scorer = OpenAIModerationScorer(
- client=oai_client,
- model_id="text-embedding-3-small"
- )
- ```
-
- **How It Works:**
-
- - Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved.
-
- **Notes:**
-
- - Requires the `openai` Python package.
- - The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client.
-
- Here you have an example in the context of an evaluation:
-
- ```python
- import asyncio
- from openai import OpenAI
- import weave
- from weave.scorers import OpenAIModerationScorer
-
- class MyModel(weave.Model):
- @weave.op
- async def predict(self, input: str) -> str:
- return input
-
- # Initialize clients and scorers
- client = OpenAI()
- model = MyModel()
- moderation_scorer = OpenAIModerationScorer(client=client)
-
- # Create dataset
- dataset = [
- {"input": "I love puppies and kittens!"},
- {"input": "I hate everyone and want to hurt them."}
- ]
-
- # Run evaluation
- evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer])
- results = asyncio.run(evaluation.evaluate(model))
- print(results)
- # {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}}
- ```
-
- ---
-
- ### `EmbeddingSimilarityScorer`
-
- The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text.
-
- ```python
- from weave.scorers import EmbeddingSimilarityScorer
-
- llm_client = ... # initialise your LlM client
-
- similarity_scorer = EmbeddingSimilarityScorer(
- client=llm_client
- target_column="reference_text", # the dataset column to compare the output against
- threshold=0.4 # the cosine similarity threshold to use
- )
- ```
-
- **Parameters:**
-
- - `target`: This scorer expects a `target` column in your dataset, it will calculate the cosine similarity of the embeddings of the `target` column to the AI system output. If your dataset doesn't contain a column called `target` you can use the scorers `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more.
- - `threshold` (float): The minimum cosine similarity score between the embedding of the AI system output and the embdedding of the `target`, above which the 2 samples are considered "similar", (defaults to `0.5`). `threshold` can be in a range from -1 to 1:
- - 1 indicates identical direction.
- - 0 indicates orthogonal vectors.
- - -1 indicates opposite direction.
-
- The correct cosine similarity threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds.
-
- Here you have an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation:
-
- ```python
- import asyncio
- from openai import OpenAI
- import weave
- from weave.scorers import EmbeddingSimilarityScorer
-
- # Initialize clients and scorers
- client = OpenAI()
- similarity_scorer = EmbeddingSimilarityScorer(
- client=client,
- threshold=0.7,
- column_map={"target": "reference"}
- )
-
- # Create dataset
- dataset = [
- {
- "input": "He's name is John",
- "reference": "John likes various types of cheese.",
- },
- {
- "input": "He's name is Pepe.",
- "reference": "Pepe likes various types of cheese.",
- },
- ]
-
- # Define model
- @weave.op
- def model(input: str) -> str:
- return "John likes various types of cheese."
-
- # Run evaluation
- evaluation = weave.Evaluation(
- dataset=dataset,
- scorers=[similarity_scorer],
- )
- result = asyncio.run(evaluation.evaluate(model))
- print(result)
- # {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}}
- ```
-
- ---
-
- ### `ValidJSONScorer`
-
- The ValidJSONScorer checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity.
-
- ```python
- from weave.scorers import ValidJSONScorer
-
- json_scorer = ValidJSONScorer()
- ```
-
- Here you have an example usage of the `ValidJSONScorer` in the context of an evaluation:
-
- ```python
- import asyncio
- import weave
- from weave.scorers import ValidJSONScorer
-
- class JSONModel(weave.Model):
- @weave.op()
- async def predict(self, input: str) -> str:
- # This is a placeholder.
- # In a real scenario, this would generate JSON.
- return '{"key": "value"}'
-
- model = JSONModel()
- json_scorer = ValidJSONScorer()
-
- dataset = [
- {"input": "Generate a JSON object with a key and value"},
- {"input": "Create an invalid JSON"}
- ]
-
- evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer])
- results = asyncio.run(evaluation.evaluate(model))
- print(results)
- # {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}}
- ```
-
- ---
-
- ### `ValidXMLScorer`
-
- The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs.
-
- ```python
- from weave.scorers import ValidXMLScorer
-
- xml_scorer = ValidXMLScorer()
- ```
-
- Here you have an example usage of the `ValidXMLScorer` in the context of an evaluation:
-
- ```python
- import asyncio
- import weave
- from weave.scorers import ValidXMLScorer
-
- class XMLModel(weave.Model):
- @weave.op()
- async def predict(self, input: str) -> str:
- # This is a placeholder. In a real scenario, this would generate XML.
- return 'value'
-
- model = XMLModel()
- xml_scorer = ValidXMLScorer()
-
- dataset = [
- {"input": "Generate a valid XML with a root element"},
- {"input": "Create an invalid XML"}
- ]
-
- evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer])
- results = asyncio.run(evaluation.evaluate(model))
- print(results)
- # {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}}
- ```
-
- ---
-
- ### `PydanticScorer`
-
- The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure.
-
- ```python
- from weave.scorers import PydanticScorer
- from pydantic import BaseModel
-
- class FinancialReport(BaseModel):
- revenue: int
- year: str
-
- pydantic_scorer = PydanticScorer(model=FinancialReport)
- ```
-
- ---
-
- ### RAGAS - `ContextEntityRecallScorer`
-
- The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library
-
- ```python
- from weave.scorers import ContextEntityRecallScorer
-
- llm_client = ... # initialise your LlM client
-
- entity_recall_scorer = ContextEntityRecallScorer(
- client=llm_client
- model_id="your-model-id"
- )
- ```
-
- **How It Works:**
-
- - Uses an LLM to extract unique entities from the output and context and calculates recall.
- - **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information.
- - Returns a dictionary with the recall score.
-
- **Notes:**
-
- - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed.
-
- ---
-
- ### RAGAS - `ContextRelevancyScorer`
-
- The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library.
-
- ```python
- from weave.scorers import ContextRelevancyScorer
-
- llm_client = ... # initialise your LlM client
-
- relevancy_scorer = ContextRelevancyScorer(
- llm_client = ... # initialise your LlM client
- model_id="your-model-id"
- )
- ```
-
- **How It Works:**
-
- - Uses an LLM to rate the relevancy of the context to the output on a scale from 0 to 1.
- - Returns a dictionary with the `relevancy_score`.
-
- **Notes:**
-
- - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed.
- - Customize the `relevancy_prompt` to define how relevancy is assessed.
-
- Here you have an example usage of `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation:
-
- ```python
- import asyncio
- from textwrap import dedent
- from openai import OpenAI
- import weave
- from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer
-
- class RAGModel(weave.Model):
- @weave.op()
- async def predict(self, question: str) -> str:
- "Retrieve relevant context"
- return "Paris is the capital of France."
-
-
- model = RAGModel()
-
- # Define prompts
- relevancy_prompt: str = dedent("""
- Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1.
-
- Question: {question}
- Context: {context}
- Relevancy Score (0-1):
- """)
-
- # Initialize clients and scorers
- llm_client = OpenAI()
- entity_recall_scorer = ContextEntityRecallScorer(
- client=client,
- model_id="gpt-4o",
- )
-
- relevancy_scorer = ContextRelevancyScorer(
- client=llm_client,
- model_id="gpt-4o",
- relevancy_prompt=relevancy_prompt
- )
-
- # Create dataset
- dataset = [
- {
- "question": "What is the capital of France?",
- "context": "Paris is the capital city of France."
- },
- {
- "question": "Who wrote Romeo and Juliet?",
- "context": "William Shakespeare wrote many famous plays."
- }
- ]
-
- # Run evaluation
- evaluation = weave.Evaluation(
- dataset=dataset,
- scorers=[entity_recall_scorer, relevancy_scorer]
- )
- results = asyncio.run(evaluation.evaluate(model))
- print(results)
- # {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}}
- ```
-
-
-
- ```plaintext
- This feature is not available in TypeScript yet. Stay tuned!
- ```
-
-
diff --git a/docs/sidebars.ts b/docs/sidebars.ts
index ede80ed6c50..0619e065c7d 100644
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -60,7 +60,7 @@ const sidebars: SidebarsConfig = {
collapsed: false,
label: "Evaluation",
link: { type: "doc", id: "guides/core-types/evaluations" },
- items: ["guides/evaluation/scorers"],
+ items: ["guides/evaluation/scorers", "guides/evaluation/predefined-scorers", "guides/evaluation/custom-scorers"],
},
"guides/core-types/prompts",
"guides/core-types/models",