diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 64b764b99d2..0605b0534df 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -240,6 +240,7 @@ jobs:
'mistral1',
'notdiamond',
'openai',
+ 'scorers_tests',
'pandas-test',
]
fail-fast: false
@@ -292,6 +293,9 @@ jobs:
WF_CLICKHOUSE_HOST: weave_clickhouse
WEAVE_SERVER_DISABLE_ECOSYSTEM: 1
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+ MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')"
trace-tests-matrix-check: # This job does nothing and is only used for the branch protection
diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md
new file mode 100644
index 00000000000..ce7ea3b86c1
--- /dev/null
+++ b/docs/docs/guides/evaluation/scorers.md
@@ -0,0 +1,670 @@
+# Evaluation Metrics
+
+## Evaluations in Weave
+In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. They take the AI's output, analyze it, and return a dictionary of results. Scorers can use your input data as a reference if needed and can also return extra information, such as explanations or reasoning from an LLM-evaluator.
+
+Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers in weave:
+
+1. **Function-based Scorers:** Simple Python functions decorated with `@weave.op`.
+2. **Class-based Scorers:** Python classes that inherit from `weave.Scorer` for more complex evaluations.
+
+Scorers must return a dictionary and can return multiple metrics, nested metrics, and non-numeric values such as text returned from an LLM-evaluator about its reasoning.
+
+## Create your own Scorers
+### Function-based Scorers
+These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like:
+
+```python
+import weave
+
+@weave.op
+def evaluate_uppercase(text: str) -> dict:
+ return {"text_is_uppercase": text.isupper()}
+
+my_eval = weave.Evaluation(
+ dataset=[{"text": "HELLO WORLD"}],
+ scorers=[evaluate_uppercase]
+)
+```
+
+When the evaluation is run, `evaluate_uppercase` checks if the text is all uppercase.
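+
+To run this evaluation you also need a model: any `@weave.op`-decorated function (or `weave.Model`) whose arguments match your dataset columns. Below is a minimal sketch, assuming `weave.init` has already been called and using a placeholder model:
+
+```python
+import asyncio
+
+@weave.op
+def my_model(text: str) -> str:
+    # Placeholder "AI system" that simply echoes its input
+    return text
+
+asyncio.run(my_eval.evaluate(my_model))
+```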
+
+### Class-based Scorers
+For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class.
+
+**Requirements:**
+1. Inherit from `weave.Scorer`.
+2. Define a `score` method decorated with `@weave.op`.
+3. The `score` method must return a dictionary.
+
+Example:
+
+
+```python
+import weave
+from openai import OpenAI
+from weave import Scorer
+
+llm_client = OpenAI()
+
+#highlight-next-line
+class SummarizationScorer(Scorer):
+ model_id: str = "gpt-4o"
+ system_prompt: str = "Evaluate whether the summary is good."
+
+ @weave.op
+ def some_complicated_preprocessing(self, text: str) -> str:
+ processed_text = "Original text: \n" + text + "\n"
+ return processed_text
+
+ @weave.op
+ def call_llm(self, summary: str, processed_text: str) -> dict:
+        res = llm_client.chat.completions.create(
+            model=self.model_id,
+            messages=[
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user", "content": (
+                    "Analyse how good the summary is compared to the original text.\n"
+                    f"Summary: {summary}\n{processed_text}"
+                )},
+            ],
+        )
+ return {"summary_quality": res}
+
+ @weave.op
+ def score(self, output: str, text: str) -> dict:
+ """Score the summary quality.
+
+ Args:
+ output: The summary generated by an AI system
+ text: The original text being summarized
+ """
+ processed_text = self.some_complicated_preprocessing(text)
+ eval_result = self.call_llm(summary=output, processed_text=processed_text)
+ return {"summary_quality": eval_result}
+
+summarization_scorer = SummarizationScorer()
+
+evaluation = weave.Evaluation(
+    dataset=[{"text": "The quick brown fox jumps over the lazy dog."}],
+    scorers=[summarization_scorer],
+)
+```
+This class evaluates how good a summary is by comparing it to the original text.
+
+## How Scorers Work
+### Scorer Keyword Arguments
+Scorers can access both the output from your AI system and the input data from the dataset row.
+
+- **Input:** If you would like your scorer to use data from your dataset row, such as a "label" or "target" column, you can make it available to the scorer by adding a `label` or `target` keyword argument to your scorer definition.
+
+For example, if you wanted to use a column called "label" from your dataset, your scorer function (or `score` class method) would have a parameter list like this:
+
+```python
+@weave.op
+def my_custom_scorer(output: str, label: int) -> dict:
+ ...
+```
+
+When a weave `Evaluation` is run, the output of the AI system is passed to the `output` parameter. The `Evaluation` also automatically tries to match any additional scorer argument names to your dataset columns. If customizing your scorer arguments or dataset columns is not feasible, you can use column mapping - see below for more.
+
+- **Output:** Include an `output` parameter in your scorer function's signature to access the AI system's output.
+
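+Putting this together, here is a minimal sketch of a scorer that uses both the model's `output` and a `label` column from the dataset (the dataset, model, and scorer here are illustrative placeholders, and `weave.init` is assumed to have been called):
+
+```python
+import asyncio
+import weave
+
+dataset = [
+    {"text": "2 + 2", "label": 4},
+    {"text": "3 + 3", "label": 7},
+]
+
+@weave.op
+def my_model(text: str) -> int:
+    return 4  # placeholder model that always answers 4
+
+@weave.op
+def exact_match(output: int, label: int) -> dict:
+    # `output` comes from the model; `label` is matched to the dataset column of the same name
+    return {"correct": output == label}
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[exact_match])
+asyncio.run(evaluation.evaluate(my_model))
+```
+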
+
+### Mapping Column Names with column_map
+Sometimes, the `score` method's argument names don't match the column names in your dataset. You can fix this using a `column_map`.
+
+If you're using a class-based scorer, pass a dictionary to the `column_map` attribute of `Scorer` when you initialize your scorer class. This dictionary maps your `score` method's argument names to the dataset's column names, in the format `{scorer_keyword_argument: dataset_column_name}`.
+
+Example:
+
+```python
+import weave
+from weave import Scorer
+
+# A dataset with news articles to be summarised
+dataset = [
+ {"news_article": "The news today was great...", "date": "2030-04-20", "source": "Bright Sky Network"},
+ ...
+]
+
+# Scorer class
+class SummarizationScorer(Scorer):
+
+ @weave.op
+    def score(self, output: str, text: str) -> dict:
+ """
+ output: output summary from a LLM summarization system
+ text: the text being summarised
+ """
+ ... # evaluate the quality of the summary
+
+# create a scorer with a column mapping the `text` argument to the `news_article` data column
+scorer = SummarizationScorer(column_map={"text" : "news_article"})
+```
+
+Now, the `text` argument in the `score` method will receive data from the `news_article` dataset column.
+
+**Notes:**
+- Another equivalent option is to subclass the `Scorer` and override the `score` method, mapping the columns explicitly:
+
+```python
+import weave
+from weave import Scorer
+
+class MySummarizationScorer(SummarizationScorer):
+
+ @weave.op
+    def score(self, output: str, news_article: str) -> dict:
+        # Override the score method and map the columns manually
+ return super().score(output=output, text=news_article)
+```
+
+### Final summarization of the scorer
+
+During evaluation, the scorer is computed for each row of your dataset. To provide a final score for the whole evaluation, Weave applies an `auto_summarize` step whose behaviour depends on the type of each value the scorer returns, as illustrated in the sketch below:
+ - the average is computed for numerical columns
+ - the count and fraction are computed for boolean columns
+ - other column types are ignored
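+
+For illustration, the default aggregation behaves roughly like this simplified sketch (this is an approximation, not Weave's actual implementation):
+
+```python
+def sketch_auto_summarize(score_rows: list[dict]) -> dict:
+    """Approximate the default per-metric aggregation described above."""
+    summary = {}
+    for key in score_rows[0] if score_rows else []:
+        values = [row[key] for row in score_rows]
+        if all(isinstance(v, bool) for v in values):
+            true_count = sum(values)
+            summary[key] = {"true_count": true_count, "true_fraction": true_count / len(values)}
+        elif all(isinstance(v, (int, float)) for v in values):
+            summary[key] = {"mean": sum(values) / len(values)}
+        # other value types are ignored
+    return summary
+
+# sketch_auto_summarize([{"accuracy": 0.75, "correct": True}, {"accuracy": 0.25, "correct": False}])
+# -> {"accuracy": {"mean": 0.5}, "correct": {"true_count": 1, "true_fraction": 0.5}}
+```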
+
+You can override the `summarize` method on the `Scorer` class and provide your own way of computing the final scores. The `summarize` function expects:
+
+- A single parameter `score_rows`: This is a list of dictionaries, where each dictionary contains the scores returned by the `score` method for a single row of your dataset.
+- It should return a dictionary containing the summarized scores.
+
+**Why is this useful?**
+
+Sometimes you need to see the scores for all rows before deciding on the final score for the dataset.
+
+```python
+import weave
+from weave import Scorer
+
+class MyBinaryScorer(Scorer):
+    """Returns True if the full output matches the target, False if not."""
+
+    @weave.op
+    def score(self, output, target) -> dict:
+        return {"match": output == target}
+
+    def summarize(self, score_rows: list) -> dict:
+        full_match = all(row["match"] for row in score_rows)
+        return {"full_match": full_match}
+```
+> In this example, the default `auto_summarize` would have returned the count and proportion of True.
+
+If you want to learn more, check the implementation of [CorrectnessLLMJudge](/tutorial-rag#optional-defining-a-scorer-class).
+
+## Predefined Scorers
+
+**Installation**
+
+To use Weave's predefined scorers you need to install some additional dependencies:
+
+```bash
+pip install weave[scorers]
+```
+
+**LLM-evaluators**
+
+The predefined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to use them. You can get all the necessary dependencies with `pip install "weave[scorers]"`.
+
+### `HallucinationFreeScorer`
+
+This scorer checks if your AI system's output includes any hallucinations based on the input data.
+
+```python
+from weave.scorers import HallucinationFreeScorer
+
+llm_client = ... # initialize your LLM client here
+
+scorer = HallucinationFreeScorer(
+ client=llm_client,
+    model_id="gpt-4o"
+)
+```
+
+**Customization:**
+- Customize the `system_prompt` and `user_prompt` attributes of the scorer to define what "hallucination" means for you.
+
+**Notes:**
+- The `score` method expects an input column named `context`. If your dataset uses a different name, use the `column_map` attribute to map `context` to the dataset column.
+
+Here is an example in the context of an evaluation:
+
+```python
+import asyncio
+from openai import OpenAI
+import weave
+from weave.scorers import HallucinationFreeScorer
+
+# Initialize clients and scorers
+llm_client = OpenAI()
+hallucination_scorer = HallucinationFreeScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ column_map={"context": "input", "output": "other_col"}
+)
+
+# Create dataset
+dataset = [
+ {"input": "John likes various types of cheese."},
+ {"input": "Pepe likes various types of cheese."},
+]
+
+@weave.op
+def model(input: str) -> str:
+ return "The person's favorite cheese is cheddar."
+
+# Run evaluation
+evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[hallucination_scorer],
+)
+result = asyncio.run(evaluation.evaluate(model))
+print(result)
+# {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}}
+```
+---
+
+### `SummarizationScorer`
+
+Use an LLM to compare a summary to the original text and evaluate the quality of the summary.
+
+```python
+from weave.scorers import SummarizationScorer
+
+llm_client = ... # initialize your LLM client here
+
+scorer = SummarizationScorer(
+ client=llm_client,
+    model_id="gpt-4o"
+)
+```
+
+**How It Works:**
+
+This scorer evaluates summaries in two ways:
+
+1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count of the summary, in order to estimate the summary's "information density". An LLM is used to extract the entities. This is similar to how entity density is used in the [Chain of Density](https://arxiv.org/abs/2309.04269) paper.
+
+2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages.
+
+**Customization:**
+- Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary.
+
+**Notes:**
+- This scorer uses the `InstructorLLMScorer` class.
+- The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed.
+
+
+Here is an example usage of the `SummarizationScorer` in the context of an evaluation:
+
+```python
+import asyncio
+from openai import OpenAI
+import weave
+from weave.scorers import SummarizationScorer
+
+class SummarizationModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+ return "This is a summary of the input text."
+
+# Initialize clients and scorers
+llm_client = OpenAI()
+model = SummarizationModel()
+summarization_scorer = SummarizationScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+)
+# Create dataset
+dataset = [
+ {"input": "The quick brown fox jumps over the lazy dog."},
+ {"input": "Artificial Intelligence is revolutionizing various industries."}
+]
+
+# Run evaluation
+evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+# {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}}
+```
+
+---
+
+### `OpenAIModerationScorer`
+
+The `OpenAIModerationScorer` uses OpenAI's Moderation API to check if the AI system's output contains disallowed content, such as hate speech or explicit material.
+
+```python
+from weave.scorers import OpenAIModerationScorer
+from openai import OpenAI
+
+oai_client = OpenAI(api_key=...) # initialize your LLM client here
+
+scorer = OpenAIModerationScorer(
+ client=oai_client,
+    model_id="text-moderation-latest"
+)
+```
+
+**How It Works:**
+
+- Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved.
+
+**Notes:**
+- Requires the `openai` Python package.
+- The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client.
+
+
+Here is an example in the context of an evaluation:
+```python
+import asyncio
+from openai import OpenAI
+import weave
+from weave.scorers import OpenAIModerationScorer
+
+class MyModel(weave.Model):
+ @weave.op
+ async def predict(self, input: str) -> str:
+ return input
+
+# Initialize clients and scorers
+client = OpenAI()
+model = MyModel()
+moderation_scorer = OpenAIModerationScorer(client=client)
+
+# Create dataset
+dataset = [
+ {"input": "I love puppies and kittens!"},
+ {"input": "I hate everyone and want to hurt them."}
+]
+
+# Run evaluation
+evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer])
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+# {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}}
+```
+
+---
+
+### `EmbeddingSimilarityScorer`
+
+The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text.
+
+```python
+from weave.scorers import EmbeddingSimilarityScorer
+
+llm_client = ... # initialize your LLM client here
+
+similarity_scorer = EmbeddingSimilarityScorer(
+    client=llm_client,
+    column_map={"target": "reference_text"},  # map the scorer's `target` argument to your dataset column
+    threshold=0.4  # the cosine similarity threshold to use
+)
+```
+
+**Parameters:**
+
+- `target`: This scorer expects a `target` column in your dataset. It calculates the cosine similarity between the embedding of the `target` column and the embedding of the AI system's output. If your dataset doesn't contain a column called `target`, you can use the scorer's `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more.
+- `threshold` (float): The minimum cosine similarity score between the embedding of the AI system's output and the embedding of the `target` above which the two samples are considered "similar" (defaults to `0.5`). `threshold` can range from -1 to 1:
+ - 1 indicates identical direction.
+ - 0 indicates orthogonal vectors.
+ - -1 indicates opposite direction.
+
+The right cosine similarity threshold can vary quite a lot depending on your use case; we advise exploring different thresholds.
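+
+For intuition, here is a minimal sketch of the cosine similarity computation that the threshold is compared against (plain Python, not Weave's internal implementation):
+
+```python
+import math
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    dot = sum(x * y for x, y in zip(a, b))
+    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))
+
+# Embeddings pointing in a similar direction score close to 1
+print(cosine_similarity([1.0, 0.0], [0.7, 0.7]))  # ~0.71, "similar" if threshold <= 0.71
+```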
+
+
+Here is an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation:
+
+```python
+import asyncio
+from openai import OpenAI
+import weave
+from weave.scorers import EmbeddingSimilarityScorer
+
+# Initialize clients and scorers
+client = OpenAI()
+similarity_scorer = EmbeddingSimilarityScorer(
+ client=client,
+ threshold=0.7,
+ column_map={"target": "reference"}
+)
+
+# Create dataset
+dataset = [
+ {
+ "input": "He's name is John",
+ "reference": "John likes various types of cheese.",
+ },
+ {
+ "input": "He's name is Pepe.",
+ "reference": "Pepe likes various types of cheese.",
+ },
+]
+
+# Define model
+@weave.op
+def model(input: str) -> str:
+ return "John likes various types of cheese."
+
+# Run evaluation
+evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[similarity_scorer],
+)
+result = asyncio.run(evaluation.evaluate(model))
+print(result)
+# {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}}
+```
+
+---
+
+### `ValidJSONScorer`
+
+The `ValidJSONScorer` checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity.
+
+```python
+from weave.scorers import ValidJSONScorer
+
+json_scorer = ValidJSONScorer()
+```
+
+Here is an example usage of the `ValidJSONScorer` in the context of an evaluation:
+
+```python
+import asyncio
+import weave
+from weave.scorers import ValidJSONScorer
+
+class JSONModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+ # This is a placeholder.
+ # In a real scenario, this would generate JSON.
+ return '{"key": "value"}'
+
+model = JSONModel()
+json_scorer = ValidJSONScorer()
+
+dataset = [
+ {"input": "Generate a JSON object with a key and value"},
+ {"input": "Create an invalid JSON"}
+]
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer])
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+# {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}}
+```
+
+
+---
+
+### `ValidXMLScorer`
+
+The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs.
+
+```python
+from weave.scorers import ValidXMLScorer
+
+xml_scorer = ValidXMLScorer()
+```
+
+
+Here is an example usage of the `ValidXMLScorer` in the context of an evaluation:
+
+```python
+import asyncio
+import weave
+from weave.scorers import ValidXMLScorer
+
+class XMLModel(weave.Model):
+ @weave.op()
+ async def predict(self, input: str) -> str:
+        # This is a placeholder. In a real scenario, this would generate XML.
+        return '<root><child>value</child></root>'
+
+model = XMLModel()
+xml_scorer = ValidXMLScorer()
+
+dataset = [
+ {"input": "Generate a valid XML with a root element"},
+ {"input": "Create an invalid XML"}
+]
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer])
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+# {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}}
+```
+
+---
+
+### `PydanticScorer`
+
+The `PydanticScorer` validates the AI system's output against a Pydantic model to ensure it adheres to a specified schema or data structure.
+
+```python
+from weave.scorers import PydanticScorer
+from pydantic import BaseModel
+
+class FinancialReport(BaseModel):
+ revenue: int
+ year: str
+
+pydantic_scorer = PydanticScorer(model=FinancialReport)
+```
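+
+Here is an example usage of the `PydanticScorer` in the context of an evaluation; the model below is a placeholder that returns a hard-coded JSON string:
+
+```python
+import asyncio
+import weave
+from pydantic import BaseModel
+from weave.scorers import PydanticScorer
+
+class FinancialReport(BaseModel):
+    revenue: int
+    year: str
+
+class ReportModel(weave.Model):
+    @weave.op()
+    async def predict(self, input: str) -> str:
+        # This is a placeholder. In a real scenario, this would generate a report.
+        return '{"revenue": 100000, "year": "2024"}'
+
+model = ReportModel()
+pydantic_scorer = PydanticScorer(model=FinancialReport)
+
+dataset = [
+    {"input": "Generate the 2024 financial report"},
+    {"input": "Generate the 2025 financial report"},
+]
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[pydantic_scorer])
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+```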
+
+---
+
+### RAGAS - `ContextEntityRecallScorer`
+
+The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. It is based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library.
+
+```python
+from weave.scorers import ContextEntityRecallScorer
+
+llm_client = ... # initialize your LLM client here
+
+entity_recall_scorer = ContextEntityRecallScorer(
+    client=llm_client,
+    model_id="your-model-id"
+)
+```
+
+**How It Works:**
+
+- Uses an LLM to extract unique entities from the output and context and calculates recall.
+- **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information.
+- Returns a dictionary with the recall score.
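+
+Conceptually, the recall value can be thought of as follows (a simplified sketch, not the scorer's exact implementation):
+
+```python
+def entity_recall(output_entities: set[str], context_entities: set[str]) -> float:
+    # Fraction of context entities that also appear in the output
+    if not context_entities:
+        return 0.0
+    return len(output_entities & context_entities) / len(context_entities)
+
+# entity_recall({"Paris", "France"}, {"Paris", "France", "Europe"})  # -> 0.67
+```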
+
+**Notes:**
+
+- Expects a `context` column in your dataset; use `column_map` to map `context` to another dataset column if needed.
+
+---
+
+### RAGAS - `ContextRelevancyScorer`
+
+The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library.
+
+```python
+from weave.scorers import ContextRelevancyScorer
+
+llm_client = ... # initialize your LLM client here
+
+relevancy_scorer = ContextRelevancyScorer(
+    client=llm_client,
+    model_id="your-model-id"
+)
+```
+
+**How It Works:**
+
+- Uses an LLM to rate the relevancy of the context to the output on a scale from 0 to 1.
+- Returns a dictionary with the `relevancy_score`.
+
+**Notes:**
+
+- Expects a `context` column in your dataset; use `column_map` to map `context` to another dataset column if needed.
+- Customize the `relevancy_prompt` to define how relevancy is assessed.
+
+
+Here is an example usage of the `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation:
+
+```python
+import asyncio
+from textwrap import dedent
+from openai import OpenAI
+import weave
+from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer
+
+class RAGModel(weave.Model):
+ @weave.op()
+ async def predict(self, question: str) -> str:
+ "Retrieve relevant context"
+ return "Paris is the capital of France."
+
+
+model = RAGModel()
+
+# Define prompts
+relevancy_prompt: str = dedent("""
+ Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1.
+
+ Question: {question}
+ Context: {context}
+ Relevancy Score (0-1):
+ """)
+
+# Initialize clients and scorers
+llm_client = OpenAI()
+entity_recall_scorer = ContextEntityRecallScorer(
+    client=llm_client,
+ model_id="gpt-4o",
+)
+
+relevancy_scorer = ContextRelevancyScorer(
+ client=llm_client,
+ model_id="gpt-4o",
+ relevancy_prompt=relevancy_prompt
+)
+
+# Create dataset
+dataset = [
+ {
+ "question": "What is the capital of France?",
+ "context": "Paris is the capital city of France."
+ },
+ {
+ "question": "Who wrote Romeo and Juliet?",
+ "context": "William Shakespeare wrote many famous plays."
+ }
+]
+
+# Run evaluation
+evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[entity_recall_scorer, relevancy_scorer]
+)
+results = asyncio.run(evaluation.evaluate(model))
+print(results)
+# {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}}
+```
+
diff --git a/docs/docs/guides/integrations/langchain.md b/docs/docs/guides/integrations/langchain.md
index b382e793e70..4487a85dfd4 100644
--- a/docs/docs/guides/integrations/langchain.md
+++ b/docs/docs/guides/integrations/langchain.md
@@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav
```python
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
sentences = [
"There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md
index 2b4f202244d..44ccdfa5a9d 100644
--- a/docs/docs/tutorial-eval.md
+++ b/docs/docs/tutorial-eval.md
@@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used
```python
import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
weave.init('intro-example')
@@ -132,7 +132,7 @@ import asyncio
# highlight-next-line
import weave
# highlight-next-line
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
import openai
# We create a model class with one predict function.
diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md
index 43fbf3d9994..e88e27e38bc 100644
--- a/docs/docs/tutorial-rag.md
+++ b/docs/docs/tutorial-rag.md
@@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple:
```python
-from weave.flow.scorer import Scorer
+from weave.scorers import Scorer
from weave import WeaveList
class CorrectnessLLMJudge(Scorer):
diff --git a/docs/sidebars.ts b/docs/sidebars.ts
index 64c8e3126ec..c5da61462b5 100644
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -54,9 +54,18 @@ const sidebars: SidebarsConfig = {
"guides/tracking/objects",
],
},
+ {
+ type: "category",
+ collapsible: true,
+ collapsed: false,
+ label: "Evaluation",
+ link: { type: "doc", id: "guides/core-types/evaluations"},
+ items: [
+ "guides/evaluation/scorers",
+ ],
+ },
"guides/core-types/models",
"guides/core-types/datasets",
- "guides/core-types/evaluations",
"guides/tracking/feedback",
"guides/tracking/costs",
"guides/core-types/media",
diff --git a/examples/text-extract/evaluate.py b/examples/text-extract/evaluate.py
index abb292b198e..357f101e387 100644
--- a/examples/text-extract/evaluate.py
+++ b/examples/text-extract/evaluate.py
@@ -6,7 +6,7 @@
import openai
import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
class TextExtractModel(weave.Model):
diff --git a/examples/tutorial_scripts/05_eval_pipeline.py b/examples/tutorial_scripts/05_eval_pipeline.py
index ccb14126a03..0a6a5baf9ab 100644
--- a/examples/tutorial_scripts/05_eval_pipeline.py
+++ b/examples/tutorial_scripts/05_eval_pipeline.py
@@ -60,7 +60,7 @@ async def predict(self, sentence: str) -> dict:
]
import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
@weave.op()
diff --git a/examples/tutorial_scripts/06_eval_pipeline_all.py b/examples/tutorial_scripts/06_eval_pipeline_all.py
index 6be10f08a44..0d5fe8fd3b2 100644
--- a/examples/tutorial_scripts/06_eval_pipeline_all.py
+++ b/examples/tutorial_scripts/06_eval_pipeline_all.py
@@ -4,7 +4,7 @@
import openai
import weave
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
# We create a model class with one predict function.
# All inputs, predictions and parameters are automatically captured for easy inspection.
diff --git a/noxfile.py b/noxfile.py
index bb74b97ec34..90aa3bfaac4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -11,6 +11,7 @@
"litellm",
"notdiamond",
"google_ai_studio",
+ "scorers_tests",
]
@@ -40,6 +41,7 @@ def lint(session):
"mistral1",
"notdiamond",
"openai",
+ "scorers_tests",
"pandas-test",
],
)
@@ -64,12 +66,21 @@ def tests(session, shard):
if shard == "google_ai_studio":
env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY")
+ # we are doing some integration test in test_llm_integrations.py that requires
+ # setting some environment variables for the LLM providers
+ if shard == "scorers_tests":
+ env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY")
+ env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY")
+ env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY")
+ env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY")
+
default_test_dirs = [f"integrations/{shard}/"]
test_dirs_dict = {
"trace": ["trace/"],
"trace_server": ["trace_server/"],
"mistral0": ["integrations/mistral/v0/"],
"mistral1": ["integrations/mistral/v1/"],
+ "scorers_tests": ["scorers/"],
}
test_dirs = test_dirs_dict.get(shard, default_test_dirs)
diff --git a/pyproject.toml b/pyproject.toml
index ff5403c4e89..407b7548327 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,8 @@ litellm = ["litellm>=1.36.1"]
llamaindex = ["llama-index>=0.10.35"]
mistral0 = ["mistralai>=0.1.8,<1.0.0"]
mistral1 = ["mistralai>=1.0.0"]
+scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"]
+scorers_tests = ["instructor>=1.5.2", "Levenshtein>=0.26.0", "openai>=1.0.0", "google-generativeai>=0.8.0", "mistralai>=1.0.3", "anthropic>=0.30.0"]
notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"]
openai = ["openai>=1.0.0"]
pandas-test = ["pandas>=2.2.3"]
diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py
new file mode 100644
index 00000000000..5f71fe724b9
--- /dev/null
+++ b/tests/scorers/test_hallucination_scorer.py
@@ -0,0 +1,105 @@
+import pytest
+from openai import OpenAI
+
+import weave
+from weave.scorers import (
+ HallucinationFreeScorer,
+)
+from weave.scorers.hallucination_scorer import (
+ HallucinationReasoning,
+ HallucinationResponse,
+)
+
+
+# mock the create function
+@pytest.fixture
+def mock_create(monkeypatch):
+ def _mock_create(*args, **kwargs):
+ return HallucinationResponse(
+ chain_of_thought="The output is consistent with the input data.",
+ reasonings=[
+ HallucinationReasoning(
+ observation="My observation for this is that the output is consistent with the input data.",
+ hallucination_type="No Hallucination",
+ )
+ ],
+ conclusion="The output is consistent with the input data.",
+ has_hallucination=True,
+ )
+
+ monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create)
+
+
+@pytest.fixture
+def hallucination_scorer(mock_create):
+ return HallucinationFreeScorer(
+ client=OpenAI(api_key="DUMMY_API_KEY"),
+ model_id="gpt-4o",
+ temperature=0.7,
+ max_tokens=4096,
+ )
+
+
+def test_hallucination_scorer_score(hallucination_scorer, mock_create):
+ output = "John's favorite cheese is cheddar."
+ context = "John likes various types of cheese."
+ result = hallucination_scorer.score(output=output, context=context)
+ # we should be able to do this validation
+ _ = HallucinationResponse.model_validate(result)
+
+ assert result["has_hallucination"] == True
+ assert result["conclusion"] == "The output is consistent with the input data."
+ assert len(result["reasonings"]) == 1
+ assert result["reasonings"][0]["hallucination_type"] == "No Hallucination"
+
+
+@pytest.mark.asyncio
+async def test_hallucination_scorer_eval(hallucination_scorer):
+ dataset = [
+ {"context": "John likes various types of cheese."},
+ {"context": "Pepe likes various types of cheese."},
+ ]
+
+ @weave.op
+ def model():
+ return "John's favorite cheese is cheddar."
+
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[hallucination_scorer],
+ )
+ result = await evaluation.evaluate(model)
+ assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2
+ assert (
+ result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0
+ )
+
+
+@pytest.mark.asyncio
+async def test_hallucination_scorer_eval2(hallucination_scorer):
+ dataset = [
+ {
+ "input": "John likes various types of cheese.",
+ "other_col": "John's favorite cheese is cheddar.",
+ },
+ {
+ "input": "Pepe likes various types of cheese.",
+ "other_col": "Pepe's favorite cheese is gouda.",
+ },
+ ]
+
+ @weave.op
+ def model(input):
+ return "The person's favorite cheese is cheddar."
+
+ hallucination_scorer.column_map = {"context": "input", "output": "other_col"}
+
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[hallucination_scorer],
+ )
+ result = await evaluation.evaluate(model)
+ assert result["HallucinationFreeScorer"]["has_hallucination"]["true_count"] == 2
+ assert (
+ result["HallucinationFreeScorer"]["has_hallucination"]["true_fraction"] == 1.0
+ )
diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py
new file mode 100644
index 00000000000..c80b7a54743
--- /dev/null
+++ b/tests/scorers/test_json_scorer.py
@@ -0,0 +1,21 @@
+import pytest
+
+from weave.scorers import ValidJSONScorer
+
+
+@pytest.mark.parametrize(
+ "output, expected_result",
+ [
+ ('{"city": "San Francisco", "country": "USA"}', True),
+ ('{"city": "San Francisco", "country": "USA"', False),
+ ("Just a plain string.", False),
+ ("[1, 2, 3, 4, 5]", True),
+ ('{"person": {"name": "John", "age": 30}, "city": "New York"}', True),
+ ("{}", True),
+ ("[]", True),
+ ],
+)
+def test_json_scorer(output, expected_result):
+ scorer = ValidJSONScorer()
+ result = scorer.score(output)
+ assert result["json_valid"] is expected_result
diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py
new file mode 100644
index 00000000000..0336955d740
--- /dev/null
+++ b/tests/scorers/test_llm_integrations.py
@@ -0,0 +1,82 @@
+import os
+
+import pytest
+
+from weave.scorers.summarization_scorer import (
+ SummarizationEvaluationResponse,
+ SummarizationScorer,
+)
+
+# Define providers and their models
+TEST_MODELS = {
+ "openai": ["gpt-4o-mini", "gpt-4o"],
+ "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"],
+ "mistral": ["mistral-small-latest", "mistral-large-latest"],
+ "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"],
+}
+
+
+def get_client_and_model(provider, model):
+ api_key_env_vars = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "mistral": "MISTRAL_API_KEY",
+ "gemini": "GOOGLE_API_KEY",
+ }
+
+ if provider not in TEST_MODELS:
+ raise ValueError(f"Unknown provider: {provider}")
+
+ if model not in TEST_MODELS[provider]:
+ raise ValueError(f"Model '{model}' not available for provider '{provider}'")
+
+ api_key = os.getenv(api_key_env_vars[provider])
+ if not api_key:
+ raise EnvironmentError(
+ f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable."
+ )
+
+ if provider == "openai":
+ from openai import OpenAI
+
+ client = OpenAI(api_key=api_key)
+ elif provider == "anthropic":
+ from anthropic import Anthropic
+
+ client = Anthropic(api_key=api_key)
+ elif provider == "mistral":
+ from mistralai import Mistral
+
+ client = Mistral(api_key=api_key)
+ elif provider == "gemini":
+ import google.generativeai as genai
+
+ genai.configure(api_key=api_key)
+ client = genai.GenerativeModel(model_name=model)
+ model = "gemini" # Adjust if necessary
+
+ return client, model
+
+
+# Generate test parameters
+test_params = [
+ (provider, model) for provider, models in TEST_MODELS.items() for model in models
+]
+
+
+@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}")
+def test_summarization_scorer_evaluate_summary(provider, model):
+ client, model_id = get_client_and_model(provider, model)
+
+ summarization_scorer = SummarizationScorer(
+ client=client,
+ model_id=model_id,
+ temperature=0.7,
+ max_tokens=1024,
+ )
+ input_text = "This is the original text."
+ summary_text = "This is the summary."
+ result = summarization_scorer.evaluate_summary(
+ input=input_text, summary=summary_text
+ )
+ assert isinstance(result, SummarizationEvaluationResponse)
diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py
new file mode 100644
index 00000000000..f06dc83bca7
--- /dev/null
+++ b/tests/scorers/test_pydantic_scorer.py
@@ -0,0 +1,30 @@
+import pytest
+from pydantic import BaseModel
+
+from weave.scorers import PydanticScorer
+
+
+class User(BaseModel):
+ name: str
+ age: int
+
+
+@pytest.fixture
+def user_scorer():
+ return PydanticScorer(model=User)
+
+
+@pytest.mark.parametrize(
+ "input_data, expected_result",
+ [
+ ('{"name": "John", "age": 30}', {"valid_pydantic": True}),
+ ({"name": "John", "age": 30}, {"valid_pydantic": True}),
+ ('{"name": "John", "age": "thirty"}', {"valid_pydantic": False}),
+ ({"name": "John", "age": "thirty"}, {"valid_pydantic": False}),
+ ('{"name": "John"}', {"valid_pydantic": False}),
+ ('{"name": "John", "age": 30, "city": "New York"}', {"valid_pydantic": True}),
+ (123, {"valid_pydantic": False}),
+ ],
+)
+def test_pydantic_scorer(user_scorer, input_data, expected_result):
+ assert user_scorer.score(input_data) == expected_result
diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py
new file mode 100644
index 00000000000..f663ac965c2
--- /dev/null
+++ b/tests/scorers/test_ragas_scorer.py
@@ -0,0 +1,66 @@
+import pytest
+from openai import OpenAI
+
+from weave.scorers import (
+ ContextEntityRecallScorer,
+ ContextRelevancyScorer,
+)
+from weave.scorers.ragas_scorer import (
+ EntityExtractionResponse,
+ RelevancyResponse,
+)
+
+
+# Mock the create function
+@pytest.fixture
+def mock_create(monkeypatch):
+ def _mock_create(*args, **kwargs):
+ # Retrieve the response_model to return appropriate mock responses
+ response_model = kwargs.get("response_model")
+ if response_model is EntityExtractionResponse:
+ return EntityExtractionResponse(entities=["Paris"])
+ elif response_model is RelevancyResponse:
+ return RelevancyResponse(
+ reasoning="The context directly answers the question.",
+ relevancy_score=1,
+ )
+
+ monkeypatch.setattr("weave.scorers.ragas_scorer.create", _mock_create)
+
+
+@pytest.fixture
+def context_entity_recall_scorer(mock_create):
+ return ContextEntityRecallScorer(
+ client=OpenAI(api_key="DUMMY_API_KEY"),
+ model_id="gpt-4o",
+ temperature=0.7,
+ max_tokens=1024,
+ )
+
+
+@pytest.fixture
+def context_relevancy_scorer(mock_create):
+ return ContextRelevancyScorer(
+ client=OpenAI(api_key="DUMMY_API_KEY"),
+ model_id="gpt-4o",
+ temperature=0.7,
+ max_tokens=1024,
+ )
+
+
+def test_context_entity_recall_scorer_score(context_entity_recall_scorer):
+ output = "Paris is the capital of France."
+ context = "The capital city of France is Paris."
+ result = context_entity_recall_scorer.score(output, context)
+ assert isinstance(result, dict)
+ assert "recall" in result
+ assert result["recall"] == 1.0 # Assuming full recall in mock response
+
+
+def test_context_relevancy_scorer_score(context_relevancy_scorer):
+ output = "What is the capital of France?"
+ context = "Paris is the capital city of France."
+ result = context_relevancy_scorer.score(output, context)
+ assert isinstance(result, dict)
+ assert "relevancy_score" in result
+ assert result["relevancy_score"] == 1 # Assuming relevancy in mock response
diff --git a/tests/scorers/test_similarity_scorer.py b/tests/scorers/test_similarity_scorer.py
new file mode 100644
index 00000000000..0a02296a55a
--- /dev/null
+++ b/tests/scorers/test_similarity_scorer.py
@@ -0,0 +1,92 @@
+import pytest
+from openai import OpenAI
+
+import weave
+from weave.scorers.llm_utils import OPENAI_DEFAULT_EMBEDDING_MODEL
+from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer
+
+
+# mock the create function
+@pytest.fixture
+def mock_embed(monkeypatch):
+ def _mock_embed(*args, **kwargs):
+ import random
+
+ return [[random.random() for _ in range(1024)] for _ in range(2)]
+
+ monkeypatch.setattr("weave.scorers.similarity_scorer.embed", _mock_embed)
+
+
+@pytest.fixture
+def similarity_scorer(mock_embed):
+ return EmbeddingSimilarityScorer(
+ client=OpenAI(api_key="DUMMY_API_KEY"),
+ model_id=OPENAI_DEFAULT_EMBEDDING_MODEL,
+ threshold=0.9,
+ )
+
+
+def test_similarity_scorer_score(similarity_scorer):
+ output = "John's favorite cheese is cheddar."
+ target = "John likes various types of cheese."
+ similarity_scorer.threshold = 0.0
+ result = similarity_scorer.score(output=output, target=target)
+ assert result["similarity_score"] > 0.0
+ assert result["is_similar"] is True
+
+
+def test_similarity_scorer_not_similar(similarity_scorer):
+ output = "John's favorite cheese is cheddar."
+ target = "John likes various types of cheese."
+ similarity_scorer.threshold = 0.99
+ result = similarity_scorer.score(output=output, target=target)
+ assert result["similarity_score"] < 0.99
+ assert result["is_similar"] is False
+
+
+@pytest.mark.asyncio
+async def test_similarity_scorer_eval(similarity_scorer):
+ dataset = [
+ {"target": "John likes various types of cheese."},
+ {"target": "Pepe likes various types of cheese."},
+ ]
+
+ @weave.op
+ def model():
+ return "He's name is John"
+
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[similarity_scorer],
+ )
+ result = await evaluation.evaluate(model)
+ assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0
+ assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2
+
+
+@pytest.mark.asyncio
+async def test_similarity_scorer_eval2(similarity_scorer):
+ dataset = [
+ {
+ "input": "He's name is John",
+ "other_col": "John likes various types of cheese.",
+ },
+ {
+ "input": "He's name is Pepe.",
+ "other_col": "Pepe likes various types of cheese.",
+ },
+ ]
+
+ @weave.op
+ def model(input):
+ return "John likes various types of cheese."
+
+ similarity_scorer.column_map = {"target": "other_col"}
+
+ evaluation = weave.Evaluation(
+ dataset=dataset,
+ scorers=[similarity_scorer],
+ )
+ result = await evaluation.evaluate(model)
+ assert result["EmbeddingSimilarityScorer"]["similarity_score"]["mean"] > 0.0
+ assert 0 <= result["EmbeddingSimilarityScorer"]["is_similar"]["true_count"] <= 2
diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py
new file mode 100644
index 00000000000..2c635ea81db
--- /dev/null
+++ b/tests/scorers/test_string_scorer.py
@@ -0,0 +1,33 @@
+import pytest
+
+from weave.scorers import (
+ LevenshteinScorer,
+ StringMatchScorer,
+)
+
+
+@pytest.mark.parametrize(
+ "output, target, expected_result",
+ [
+ ("Morgan", "Hello my name is Morgan", True),
+ ("Alice", "Hello my name is Bob", False),
+ ],
+)
+def test_string_match_scorer(output, target, expected_result):
+ scorer = StringMatchScorer()
+ result = scorer.score(output, target)
+ assert result["string_in_input"] is expected_result
+
+
+@pytest.mark.parametrize(
+ "output, target, expected_distance",
+ [
+ ("Hello", "Hallo", 1),
+ ("Hello", "Hello", 0),
+ ("Hello", "World", 4),
+ ],
+)
+def test_levenshtein_scorer(output, target, expected_distance):
+ scorer = LevenshteinScorer()
+ result = scorer.score(output, target)
+ assert result["levenshtein_distance"] == expected_distance
diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py
new file mode 100644
index 00000000000..ca6c3f7139b
--- /dev/null
+++ b/tests/scorers/test_summarization_scorer.py
@@ -0,0 +1,110 @@
+import pytest
+from openai import OpenAI
+
+import weave
+from weave.scorers import (
+ SummarizationScorer,
+)
+from weave.scorers.summarization_scorer import (
+ EntityExtractionResponse,
+ SummarizationEvaluationResponse,
+)
+
+
+@pytest.fixture
+def mock_create(monkeypatch):
+ def _mock_create(*args, **kwargs):
+ response_model = kwargs.get("response_model")
+ if response_model == EntityExtractionResponse:
+ return EntityExtractionResponse(entities=["entity1", "entity2"])
+ elif response_model == SummarizationEvaluationResponse:
+ return SummarizationEvaluationResponse(
+ think_step_by_step="This is some reasoning.",
+ summarization_evaluation="excellent",
+ )
+
+ # Patch the 'create' function wherever it is called
+ monkeypatch.setattr("weave.scorers.summarization_scorer.create", _mock_create)
+
+
+@pytest.fixture
+def summarization_scorer(mock_create):
+ return SummarizationScorer(
+ client=OpenAI(api_key="DUMMY_API_KEY"),
+ model_id="gpt-4o",
+ temperature=0.7,
+ max_tokens=1024,
+ )
+
+
+def test_summarization_scorer_evaluate_summary(summarization_scorer, mock_create):
+ input_text = "This is the original text."
+ summary_text = "This is the summary."
+ result = summarization_scorer.evaluate_summary(
+ input=input_text, summary=summary_text
+ )
+ assert isinstance(result, SummarizationEvaluationResponse)
+ assert result.summarization_evaluation == "excellent"
+ assert result.think_step_by_step == "This is some reasoning."
+
+
+@pytest.mark.asyncio
+async def test_summarization_scorer_score(summarization_scorer):
+ input_text = "This is the original text."
+ output_text = "This is the summary."
+ result = await summarization_scorer.score(input=input_text, output=output_text)
+ assert isinstance(result, dict)
+ assert "summarization_eval_score" in result
+ assert result["summarization_eval_score"] == 1.0 # "excellent" maps to 1.0
+ assert "llm_eval_reasoning" in result
+ assert result["llm_eval_reasoning"] == "This is some reasoning."
+ assert "is_entity_dense" in result
+ assert isinstance(result["is_entity_dense"], bool)
+ assert "entity_density" in result
+ assert isinstance(result["entity_density"], float)
+
+
+def test_summarization_scorer_initialization(summarization_scorer):
+ assert isinstance(summarization_scorer, SummarizationScorer)
+ assert summarization_scorer.model_id == "gpt-4o"
+ assert summarization_scorer.temperature == 0.7
+ assert summarization_scorer.max_tokens == 1024
+
+
+def test_summarization_scorer_extract_entities(summarization_scorer):
+ text = "This is a sample text with entities."
+ entities = summarization_scorer.extract_entities(text)
+ assert isinstance(entities, list)
+ assert len(entities) == 2
+ assert "entity1" in entities
+ assert "entity2" in entities
+
+
+@pytest.mark.asyncio
+async def test_evaluate_summary_scorer(summarization_scorer):
+ dataset = [
+ {
+ "input": "This is the original text.",
+ },
+ {
+ "input": "This is another original text.",
+ },
+ ]
+ evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
+
+ @weave.op
+ def model(input: str):
+ return "This is the summary."
+
+ result = await evaluation.evaluate(model)
+ assert isinstance(result, dict)
+ assert "SummarizationScorer" in result
+ assert "entity_density" in result["SummarizationScorer"]
+ assert "is_entity_dense" in result["SummarizationScorer"]
+ assert "summarization_eval_score" in result["SummarizationScorer"]
+ assert "model_latency" in result
+
+ assert result["SummarizationScorer"]["entity_density"]["mean"] == pytest.approx(0.5)
+ assert result["SummarizationScorer"]["is_entity_dense"]["true_count"] == 2
+ assert result["SummarizationScorer"]["is_entity_dense"]["true_fraction"] == 1.0
+ assert result["SummarizationScorer"]["summarization_eval_score"]["mean"] == 1.0
diff --git a/tests/scorers/test_utils.py b/tests/scorers/test_utils.py
new file mode 100644
index 00000000000..03d95aff6c9
--- /dev/null
+++ b/tests/scorers/test_utils.py
@@ -0,0 +1,8 @@
+from weave.scorers.utils import stringify
+
+
+def test_stringify():
+ assert stringify("Hello, world!") == "Hello, world!"
+ assert stringify(123) == "123"
+ assert stringify([1, 2, 3]) == "[\n 1,\n 2,\n 3\n]"
+ assert stringify({"a": 1, "b": 2}) == '{\n "a": 1,\n "b": 2\n}'
diff --git a/tests/trace/test_client_trace.py b/tests/trace/test_client_trace.py
index 857d9b50042..2f444e30198 100644
--- a/tests/trace/test_client_trace.py
+++ b/tests/trace/test_client_trace.py
@@ -1443,7 +1443,7 @@ def test_named_reuse(client):
dataset = weave.ref(d_ref.uri()).get()
@weave.op()
- async def dummy_score(model_output):
+ async def dummy_score(output):
return 1
class SimpleModel(weave.Model):
diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py
index f5ada25215f..76cc9f5b739 100644
--- a/tests/trace/test_evaluate.py
+++ b/tests/trace/test_evaluate.py
@@ -4,14 +4,14 @@
import weave
from weave import Dataset, Evaluation, Model
-from weave.flow.scorer import MultiTaskBinaryClassificationF1
+from weave.scorers import MultiTaskBinaryClassificationF1
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
dataset = Dataset(rows=dataset_rows)
expected_eval_result = {
- "model_output": {"mean": 9.5},
+ "output": {"mean": 9.5},
"score": {"true_count": 1, "true_fraction": 0.5},
"model_latency": {"mean": pytest.approx(0, abs=1)},
}
@@ -24,8 +24,8 @@ async def predict(self, input) -> str:
@weave.op()
-def score(target, model_output):
- return target == model_output
+def score(target, output):
+ return target == output
@weave.op()
@@ -57,7 +57,7 @@ async def model_predict(input, target) -> str:
)
result = asyncio.run(evaluation.evaluate(model_predict))
assert result == {
- "model_output": {"mean": 18.5},
+ "output": {"mean": 18.5},
"score": {"true_count": 0, "true_fraction": 0.0},
"model_latency": {
"mean": pytest.approx(0, abs=1),
@@ -111,8 +111,8 @@ async def infer(self, input) -> str:
def test_score_as_class(client):
class MyScorer(weave.Scorer):
@weave.op()
- def score(self, target, model_output):
- return target == model_output
+ def score(self, target, output):
+ return target == output
evaluation = Evaluation(
dataset=dataset_rows,
@@ -121,7 +121,7 @@ def score(self, target, model_output):
model = EvalModel()
result = asyncio.run(evaluation.evaluate(model))
assert result == {
- "model_output": {"mean": 9.5},
+ "output": {"mean": 9.5},
"MyScorer": {"true_count": 1, "true_fraction": 0.5},
"model_latency": {
"mean": pytest.approx(0, abs=1),
@@ -137,8 +137,8 @@ def summarize(self, score_rows):
return {"awesome": 3}
@weave.op()
- def score(self, target, model_output):
- return target == model_output
+ def score(self, target, output):
+ return target == output
evaluation = Evaluation(
dataset=dataset_rows,
@@ -147,7 +147,7 @@ def score(self, target, model_output):
model = EvalModel()
result = asyncio.run(evaluation.evaluate(model))
assert result == {
- "model_output": {"mean": 9.5},
+ "output": {"mean": 9.5},
"MyScorer": {"awesome": 3},
"model_latency": {
"mean": pytest.approx(0, abs=1),
@@ -167,7 +167,7 @@ def return_pred(pred):
result = asyncio.run(evaluation.evaluate(return_pred))
assert result == {
- "model_output": {
+ "output": {
"a": {"true_count": 1, "true_fraction": 1.0},
"b": {"true_count": 0, "true_fraction": 0.0},
},
diff --git a/tests/trace/test_evaluation_performance.py b/tests/trace/test_evaluation_performance.py
index 51aceb0c1e8..8ccd8f9639b 100644
--- a/tests/trace/test_evaluation_performance.py
+++ b/tests/trace/test_evaluation_performance.py
@@ -91,8 +91,8 @@ def predict(question: str):
return "I don't know"
@weave.op()
- def score(question: str, expected: str, model_output: str):
- return model_output == expected
+ def score(question: str, expected: str, output: str):
+ return output == expected
evaluation = weave.Evaluation(
name="My Evaluation",
diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py
index 7585980ce78..d137a92d4ef 100644
--- a/tests/trace/test_evaluations.py
+++ b/tests/trace/test_evaluations.py
@@ -9,6 +9,7 @@
import weave
from tests.trace.util import AnyIntMatcher
from weave import Evaluation, Model
+from weave.scorers import Scorer
from weave.trace.feedback_types.score import SCORE_TYPE_NAME
from weave.trace.weave_client import get_ref
from weave.trace_server import trace_server_interface as tsi
@@ -45,7 +46,6 @@ class MyModel(Model):
@weave.op()
def predict(self, question: str):
- # Here's where you would add your LLM call and return the output
return {"generated_text": "Hello, " + question + self.prompt}
@@ -58,12 +58,12 @@ async def do_quickstart():
]
@weave.op()
- def match_score1(expected: str, model_output: dict) -> dict:
- return {"match": expected == model_output["generated_text"]}
+ def match_score1(expected: str, output: dict) -> dict:
+ return {"match": expected == output["generated_text"]}
@weave.op()
- def match_score2(expected: dict, model_output: dict) -> dict:
- return {"match": expected == model_output["generated_text"]}
+ def match_score2(expected: dict, output: dict) -> dict:
+ return {"match": expected == output["generated_text"]}
model = MyModel(prompt="World")
evaluation = Evaluation(dataset=examples, scorers=[match_score1, match_score2])
@@ -192,32 +192,32 @@ def predict(self, question: str):
return {"response": res["response"], "confidence": 1 / (len(res) + 1)}
-def score_int(expected: str, model_output: dict) -> int:
+def score_int(expected: str, output: dict) -> int:
matches = 0
- for i in range(min(len(expected), len(model_output["response"]))):
- if expected[i] == model_output["response"][i]:
+ for i in range(min(len(expected), len(output["response"]))):
+ if expected[i] == output["response"][i]:
matches += 1
return matches
-def score_float(expected: str, model_output: dict) -> float:
- matches = score_int(expected, model_output)
- return matches / max(len(expected), len(model_output["response"]))
+def score_float(expected: str, output: dict) -> float:
+ matches = score_int(expected, output)
+ return matches / max(len(expected), len(output["response"]))
-def score_bool(expected: str, model_output: dict) -> bool:
- return score_float(expected, model_output) == 1.0
+def score_bool(expected: str, output: dict) -> bool:
+ return score_float(expected, output) == 1.0
-def score_dict(expected: str, model_output: dict) -> dict:
+def score_dict(expected: str, output: dict) -> dict:
return {
- "d_int": score_int(expected, model_output),
- "d_float": score_float(expected, model_output),
- "d_bool": score_bool(expected, model_output),
+ "d_int": score_int(expected, output),
+ "d_float": score_float(expected, output),
+ "d_bool": score_bool(expected, output),
"d_nested": {
- "d_int": score_int(expected, model_output),
- "d_float": score_float(expected, model_output),
- "d_bool": score_bool(expected, model_output),
+ "d_int": score_int(expected, output),
+ "d_float": score_float(expected, output),
+ "d_bool": score_bool(expected, output),
},
"reason": "This is a test reason",
}
@@ -225,32 +225,32 @@ def score_dict(expected: str, model_output: dict) -> dict:
class MyIntScorer(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> int:
- return score_int(expected, model_output)
+ def score(self, expected: str, output: dict) -> int:
+ return score_int(expected, output)
class MyFloatScorer(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> float:
- return score_float(expected, model_output)
+ def score(self, expected: str, output: dict) -> float:
+ return score_float(expected, output)
class MyBoolScorer(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> bool:
- return score_bool(expected, model_output)
+ def score(self, expected: str, output: dict) -> bool:
+ return score_bool(expected, output)
class MyDictScorer(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> dict:
- return score_dict(expected, model_output)
+ def score(self, expected: str, output: dict) -> dict:
+ return score_dict(expected, output)
class MyDictScorerWithCustomFloatSummary(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> dict:
- return score_dict(expected, model_output)
+ def score(self, expected: str, output: dict) -> dict:
+ return score_dict(expected, output)
@weave.op()
def summarize(self, score_rows: list) -> Optional[dict]:
@@ -260,8 +260,8 @@ def summarize(self, score_rows: list) -> Optional[dict]:
class MyDictScorerWithCustomBoolSummary(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> dict:
- return score_dict(expected, model_output)
+ def score(self, expected: str, output: dict) -> dict:
+ return score_dict(expected, output)
@weave.op()
def summarize(self, score_rows: list) -> Optional[dict]:
@@ -271,8 +271,8 @@ def summarize(self, score_rows: list) -> Optional[dict]:
class MyDictScorerWithCustomDictSummary(weave.Scorer):
@weave.op()
- def score(self, expected: str, model_output: dict) -> dict:
- return score_dict(expected, model_output)
+ def score(self, expected: str, output: dict) -> dict:
+ return score_dict(expected, output)
@weave.op()
def summarize(self, score_rows: list) -> Optional[dict]:
@@ -393,7 +393,7 @@ async def test_evaluation_data_topology(client):
# Prediction Section
confidence = 1 / 4
- model_output = {
+ output = {
"response": "A",
"confidence": confidence,
}
@@ -432,7 +432,7 @@ async def test_evaluation_data_topology(client):
}
# Prediction
- assert predict_call.output == model_output
+ assert predict_call.output == output
assert with_empty_feedback(predict_call.summary) == with_empty_feedback(
predict_usage
)
@@ -457,7 +457,7 @@ async def test_evaluation_data_topology(client):
# Predict And Score Group
assert predict_and_score_call.output == {
- "model_output": model_output,
+ "output": output,
"scores": {
"score_int": score_int_score,
"score_float": score_float_score,
@@ -471,7 +471,7 @@ async def test_evaluation_data_topology(client):
}
# Summary section
- model_output_summary = {
+ output_summary = {
"confidence": {"mean": confidence},
}
score_int_auto_summary = {"mean": 1.5}
@@ -544,7 +544,7 @@ async def test_evaluation_data_topology(client):
"MyDictScorerWithCustomBoolSummary": dict_scorer_bool_summary,
"MyDictScorerWithCustomDictSummary": dict_scorer_dict_summary,
"model_latency": model_latency,
- "model_output": model_output_summary,
+ "output": output_summary,
}
)
assert evaluate_call.summary == with_empty_feedback(predict_usage_summary)
@@ -566,13 +566,13 @@ async def test_evaluation_data_topology(client):
def make_test_eval():
- def function_score(target: dict, model_output: dict) -> dict:
- return {"correct": target == model_output}
+ def function_score(expected: str, output: dict) -> dict:
+ return {"correct": expected == output["generated_text"]}
evaluation = weave.Evaluation(
name="fruit_eval",
dataset=[
- {"id": "0", "sentence": "a", "target": "b"},
+ {"id": "0", "sentence": "a", "expected": "b"},
],
scorers=[function_score],
)
@@ -665,7 +665,7 @@ async def test_eval_is_robust_to_missing_values(client):
def model_func(model_res) -> dict:
return resp[model_res]
- def function_score(scorer_res, model_output) -> dict:
+ def function_score(scorer_res, output) -> dict:
return resp[scorer_res]
evaluation = weave.Evaluation(
@@ -676,7 +676,7 @@ def function_score(scorer_res, model_output) -> dict:
res = await evaluation.evaluate(model_func)
assert res == {
- "model_output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}},
+ "output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}},
"function_score": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}},
"model_latency": {"mean": pytest.approx(0, abs=1)},
}
@@ -715,7 +715,7 @@ def model_func(
return text
- def function_score(image, dc, model, obj, text, model_output) -> bool:
+ def function_score(image, dc, model, obj, text, output) -> bool:
assert isinstance(image, Image.Image)
# Note: when we start recursively saving dataset rows, this will
@@ -728,7 +728,7 @@ def function_score(image, dc, model, obj, text, model_output) -> bool:
assert isinstance(model, MyModel)
assert isinstance(obj, MyObj)
assert isinstance(text, str)
- assert isinstance(model_output, str)
+ assert isinstance(output, str)
return True
@@ -780,6 +780,161 @@ def function_score(image, dc, model, obj, text, model_output) -> bool:
@pytest.mark.asyncio
+async def test_evaluation_with_column_map():
+ # Define a dummy scorer that uses column_map
+ class DummyScorer(Scorer):
+ @weave.op()
+ def score(self, foo: str, bar: str, output: str, target: str) -> dict:
+            # Return whether foo + bar matches both the model output and the target
+ return {"match": (foo + bar) == output == target}
+
+ # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2'
+ dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"})
+
+ @weave.op()
+ def model_function(col1, col2):
+ # For testing, return the concatenation of col1 and col2
+ return col1 + col2
+
+ dataset = [
+ {"col1": "Hello", "col2": "World", "target": "HelloWorld"},
+ {"col1": "Hi", "col2": "There", "target": "HiThere"},
+ {"col1": "Good", "col2": "Morning", "target": "GoodMorning"},
+ {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"},
+ ]
+
+ evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer])
+
+ # Run the evaluation
+ eval_out = await evaluation.evaluate(model_function)
+
+ # Check that 'DummyScorer' is in the results
+ assert "DummyScorer" in eval_out
+
+ # The expected summary should show that 3 out of 4 predictions matched
+ expected_results = {"true_count": 3, "true_fraction": 0.75}
+ assert (
+ eval_out["DummyScorer"]["match"] == expected_results
+ ), "The summary should reflect the correct number of matches"
+
+
+@pytest.mark.asyncio
+async def test_evaluation_with_wrong_column_map():
+ # Define a dummy scorer that uses column_map
+ class DummyScorer(Scorer):
+ @weave.op()
+ def score(self, foo: str, bar: str, output: str, target: str) -> dict:
+            # Return whether foo + bar matches both the model output and the target
+ return {"match": (foo + bar) == output == target}
+
+ @weave.op()
+ def model_function(col1, col2):
+ # For testing, return the concatenation of col1 and col2
+ return col1 + col2
+
+ dataset = [
+ {"col1": "Hello", "col2": "World", "target": "HelloWorld"}, # True
+ {"col1": "Hi", "col2": "There", "target": "HiThere"}, # True
+ {"col1": "Good", "col2": "Morning", "target": "GoodMorning"}, # True
+ {"col1": "Bad", "col2": "Evening", "target": "GoodEvening"}, # False
+ ]
+
+ # Test that the column map is correctly used
+ dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"})
+ evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer])
+ eval_out = await evaluation.evaluate(model_function)
+ assert "DummyScorer" in eval_out
+ assert eval_out["DummyScorer"]["match"] == {"true_count": 3, "true_fraction": 0.75}
+
+ with pytest.raises(ValueError) as excinfo:
+ # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col3'
+ # this is wrong because col3 does not exist
+ dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col3"})
+ evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer])
+ await evaluation.predict_and_score(model_function, dataset[0])
+ assert "which is not in the scorer's argument names" in str(excinfo.value)
+
+ with pytest.raises(ValueError) as excinfo:
+ # Create the scorer with column_map missing a column
+ dummy_scorer = DummyScorer(column_map={"foo": "col1"})
+ evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer])
+ await evaluation.predict_and_score(model_function, dataset[0])
+ assert "is not found in the dataset columns" in str(excinfo.value)
+
+ with pytest.raises(ValueError) as excinfo:
+ # Create the scorer with wrong argument name
+ dummy_scorer = DummyScorer(column_map={"jeez": "col1"})
+ evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer])
+ await evaluation.predict_and_score(model_function, dataset[0])
+ assert "is not found in the dataset columns and is not mapped" in str(
+ excinfo.value
+ )
+
+
+# Define another dummy scorer
+@pytest.mark.asyncio
+async def test_evaluation_with_multiple_column_maps():
+ class DummyScorer(Scorer):
+ @weave.op()
+ def score(self, foo: str, bar: str, output: str, target: str) -> dict:
+            # Return whether foo + bar matches both the model output and the target
+ return {"match": (foo + bar) == output == target}
+
+ class AnotherDummyScorer(Scorer):
+ @weave.op()
+ def score(self, input1: str, input2: str, output: str) -> dict:
+ # Return whether input1 == output reversed
+ return {"match": input1 == output[::-1]}
+
+ # First scorer maps 'foo'->'col1', 'bar'->'col2'
+ dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"})
+
+ # Second scorer maps 'input1'->'col2', 'input2'->'col1'
+ another_dummy_scorer = AnotherDummyScorer(
+ column_map={"input1": "col2", "input2": "col1"}
+ )
+
+ @weave.op()
+ def model_function(col1, col2):
+ # For testing, return the concatenation of col1 and col2
+ return col1 + col2
+
+ dataset = [
+ {"col1": "abc", "col2": "def", "target": "abcdef"},
+ {"col1": "123", "col2": "456", "target": "1111"},
+ {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"},
+ ]
+
+ evaluation = Evaluation(
+ dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer]
+ )
+
+ # Run the evaluation
+ eval_out = await evaluation.evaluate(model_function)
+
+ # Check that both scorers are in the results
+ assert "DummyScorer" in eval_out
+ assert "AnotherDummyScorer" in eval_out
+
+ # Assertions for the first scorer
+ expected_results_dummy = {"true_count": 1, "true_fraction": 1.0 / 3}
+ assert (
+ eval_out["DummyScorer"]["match"] == expected_results_dummy
+ ), "All concatenations should match the target"
+
+ # Assertions for the second scorer
+    # Since input1 is mapped to col2 and output is col1 + col2, we check whether col2 == (col1 + col2)[::-1]
+ # Evaluate manually:
+ # First row: col2 = "def", output = "abcdef", output[::-1] = "fedcba" -> "def" != "fedcba"
+ # Second row: col2 = "456", output = "123456", output[::-1] = "654321" -> "456" != "654321"
+    # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" != "xyzzyx"
+ # So all matches are False
+ expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0}
+ assert (
+ eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy
+ ), "No matches should be found for AnotherDummyScorer"
+
+
async def test_feedback_is_correctly_linked(client):
@weave.op
def predict(text: str) -> str:
diff --git a/tests/trace/test_weave_client.py b/tests/trace/test_weave_client.py
index 95866e3ea5c..6f0af63d103 100644
--- a/tests/trace/test_weave_client.py
+++ b/tests/trace/test_weave_client.py
@@ -754,8 +754,8 @@ async def model_predict(input) -> str:
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
@weave.op()
- async def score(target, model_output):
- return target == model_output
+ async def score(target, output):
+ return target == output
evaluation = Evaluation(
name="my-eval",
@@ -764,7 +764,7 @@ async def score(target, model_output):
)
result = asyncio.run(evaluation.evaluate(model_predict))
expected_eval_result = {
- "model_output": {"mean": 9.5},
+ "output": {"mean": 9.5},
"score": {"true_count": 1, "true_fraction": 0.5},
}
assert result == expected_eval_result
@@ -864,8 +864,8 @@ def test_nested_ref_is_inner(client):
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
@weave.op()
- async def score(target, model_output):
- return target == model_output
+ async def score(target, output):
+ return target == output
evaluation = Evaluation(
name="my-eval",
diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index 6bacdb74a13..ae433129f46 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -13,7 +13,7 @@
from weave.flow.dataset import Dataset
from weave.flow.model import Model, get_infer_method
from weave.flow.obj import Object
-from weave.flow.scorer import (
+from weave.scorers import (
Scorer,
auto_summarize,
get_scorer_attributes,
@@ -223,7 +223,7 @@ async def predict_and_score(
model_output = None
model_latency = time.time() - model_start_time
- scores = {}
+ scores = {} # TODO: Consider moving scorer setup and checks out of `predict_and_score`
scorers = cast(list[Union[Op, Scorer]], self.scorers or [])
for scorer in scorers:
scorer_self = None
@@ -237,13 +237,101 @@ async def predict_and_score(
score_signature = inspect.signature(score_fn)
score_arg_names = list(score_signature.parameters.keys())
- if "model_output" not in score_arg_names:
- raise OpCallError(
- f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function."
+ if (
+ "model_output" not in score_arg_names
+ and "output" not in score_arg_names
+ ):
+ message = textwrap.dedent(
+ f"""
+ Scorer {scorer_name} must have an `output` or `model_output` argument, to receive the
+ output of the model function.
+ """
)
+ raise OpCallError(message)
if isinstance(example, dict):
- score_args = {k: v for k, v in example.items() if k in score_arg_names}
+ # The keys of `score_args` must match the argument names of the scorer's `score` method.
+ # If scorer.column_map is set, then user is indicating that the dataset column(s)
+ # being passed to the scorer have different names to the `score` functions' argument names.
+            # So we need to remap the dataset columns to the expected argument names in the scorer.
+ #
+ # column_map k:v pairs must be structured as `scorer param name : dataset column name`
+ #
+ # For instance, if the scorer expects "input" and "ground_truth" and we have a dataset
+ # with columns "question" and "answer", column_map should be defined as follows:
+ # {"input": "question", "ground_truth": "answer"}
+ #
+            # The full dataset row is available here via `example`; the model output is
+            # available via `model_output` and is always passed to the scorer as the `output` argument.
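+            #
+            # Minimal illustrative sketch of a column_map (the class and column names below
+            # are hypothetical, not part of this change):
+            #
+            #     class QAScorer(Scorer):
+            #         @weave.op()
+            #         def score(self, input: str, ground_truth: str, output: str) -> dict:
+            #             return {"correct": ground_truth in output}
+            #
+            #     scorer = QAScorer(column_map={"input": "question", "ground_truth": "answer"})
+            #     evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer])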
+ score_arg_names = [
+ param for param in score_arg_names if (param != "self")
+ ]
+ score_args = {}
+
+ if isinstance(scorer, Scorer) and scorer.column_map is not None:
+ # Ensure that all keys in column_map are in score_arg_names
+ for key in scorer.column_map.keys():
+ if key not in score_arg_names:
+ message = textwrap.dedent(
+ f"""
+ You have created `{scorer_name}(column_map={scorer.column_map}, ...)`.
+
+                            The `column_map` contains a key, `{key}`, which is not in the `score` method's argument names.
+                            `score` method's argument names: {score_arg_names}
+
+ Hint:
+ - Ensure that the keys in `column_map` match the scorer's argument names.
+ """
+ )
+ raise ValueError(message)
+
+ for arg in score_arg_names:
+ if arg == "output" or arg == "model_output":
+ continue
+ if arg in example:
+ score_args[arg] = example[arg]
+ elif arg in scorer.column_map:
+ dataset_column_name = scorer.column_map[arg]
+ if dataset_column_name in example:
+ score_args[arg] = example[dataset_column_name]
+ else:
+ message = textwrap.dedent(
+ f"""
+ You have created `{scorer_name}(column_map={scorer.column_map}, ...)`.
+
+ You are mapping `{arg}` to `{dataset_column_name}`, but `{dataset_column_name}`
+ was not found in the dataset columns.
+
+ Available dataset columns: {list(example.keys())}
+
+ Hint:
+                                    - Ensure that `column_map` maps the `score` method's argument names to existing dataset column names.
+ """
+ )
+ raise ValueError(message)
+ else:
+ message = textwrap.dedent(
+ f"""
+ You have created `{scorer_name}(column_map={scorer.column_map}, ...)`.
+
+ `score` method argument `{arg}` is not found in the dataset columns and is not mapped in `column_map`.
+
+ Available dataset columns: {list(example.keys())}
+ `column_map`: {scorer.column_map}
+
+ Hint:
+ Either:
+                            - map the argument name to the dataset column using the scorer's `column_map` attribute, in the form {{score_arg_name: dataset_column_name}}, or
+                            - rename a column in the dataset to `{arg}`, or
+                            - rename the `{arg}` argument in your `score` method to match a dataset column name
+ """
+ )
+ raise ValueError(message)
+ else:
+ score_args = {
+ k: v for k, v in example.items() if k in score_arg_names
+ }
+
else:
if len(score_arg_names) == 2:
score_args = {score_arg_names[0]: example}
@@ -251,7 +339,7 @@ async def predict_and_score(
raise ValueError(
f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
)
- score_args["model_output"] = model_output
+ score_args["output"] = model_output
try:
if is_op(score_fn) and model_call:
@@ -275,29 +363,41 @@ async def predict_and_score(
except OpCallError as e:
dataset_column_names = list(example.keys())
dataset_column_names_str = ", ".join(dataset_column_names[:3])
- if len(dataset_column_names) > 3:
+ if len(dataset_column_names) > 10:
dataset_column_names_str += ", ..."
required_arg_names = [
param.name
for param in score_signature.parameters.values()
if param.default == inspect.Parameter.empty
]
- required_arg_names.remove("model_output")
+        required_arg_names = [
+            n for n in required_arg_names if n not in ("output", "model_output")
+        ]
message = textwrap.dedent(
f"""
Call error: {e}
+            If you are using the weave `Scorer` class, you can set the `scorer.column_map`
+ attribute to map scorer argument names to dataset columns.
+
+            For example, if the `score` method expects "output", "input" and "ground_truth" and the dataset
+            has columns "question" and "answer", `column_map` can be used to map the non-output parameters like so:
+ {{"input": "question", "ground_truth": "answer"}}
+
+ scorer argument names: {score_arg_names}
+            dataset keys: {list(example.keys())}
+ scorer.column_map: {getattr(scorer, 'column_map', '{}')}
+
Options for resolving:
- a. change {scorer_name} argument names to match a subset of dataset column names ({dataset_column_names_str})
- b. change dataset column names to match expected {scorer_name} argument names: {required_arg_names}
+            a. if you are using the weave `Scorer` class, set the `scorer.column_map` attribute to map scorer argument names to dataset column names, or
+            b. change the argument names in the scoring function of {scorer_name} to match a subset of the dataset column names: ({dataset_column_names_str}), or
+ c. change dataset column names to match expected {scorer_name} argument names: {required_arg_names}
"""
)
raise OpCallError(message)
scores[scorer_name] = result
return {
- "model_output": model_output,
+ "output": model_output,
"scores": scores,
"model_latency": model_latency,
}
@@ -341,7 +441,7 @@ async def eval_example(example: dict) -> dict:
except Exception as e:
print("Predict and score failed")
traceback.print_exc()
- return {"model_output": None, "scores": {}}
+ return {"output": None, "scores": {}}
return eval_row
n_complete = 0
@@ -358,7 +458,7 @@ async def eval_example(example: dict) -> dict:
# f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]" # type:ignore
# )
if eval_row is None:
- eval_row = {"model_output": None, "scores": {}}
+ eval_row = {"output": None, "scores": {}}
else:
eval_row["scores"] = eval_row.get("scores", {})
for scorer in self.scorers or []:
diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py
index e69f3afeb3f..86df3d6a055 100644
--- a/weave/flow/scorer.py
+++ b/weave/flow/scorer.py
@@ -1,158 +1,12 @@
-from collections import defaultdict
-from numbers import Number
-from typing import Any, Callable, Optional, Sequence, Tuple, Union
-
-import numpy as np
-from pydantic import BaseModel
-
-import weave
-from weave.flow.obj import Object
-from weave.trace.isinstance import weave_isinstance
-from weave.trace.op import Op, as_op, is_op
-
-
-class Scorer(Object):
- def score(self, target: Any, model_output: Any) -> Any:
- raise NotImplementedError
-
- @weave.op()
- def summarize(self, score_rows: list) -> Optional[dict]:
- return auto_summarize(score_rows)
-
-
-def stderr(data: Sequence[Union[int, float]]) -> float:
- if len(data) > 1:
- sample_variance = np.var(data, ddof=1)
- return float(np.sqrt(sample_variance / len(data)))
- else:
- return 0
-
-
-def auto_summarize(data: list) -> Optional[dict[str, Any]]:
- """Automatically summarize a list of (potentially nested) dicts.
-
- Computes:
- - avg for numeric cols
- - count and fraction for boolean cols
- - other col types are ignored
-
- If col is all None, result is None
-
- Returns:
- dict of summary stats, with structure matching input dict structure.
- """
- if not data:
- return {}
- data = [x for x in data if x is not None]
-
- if not data:
- return None
-
- val = data[0]
-
- if isinstance(val, bool):
- return {
- "true_count": (true_count := sum(1 for x in data if x)),
- "true_fraction": true_count / len(data),
- }
- elif isinstance(val, Number):
- return {"mean": np.mean(data).item()}
- elif isinstance(val, dict):
- result = {}
- all_keys = set().union(*[x.keys() for x in data if isinstance(x, dict)])
- for k in all_keys:
- if (
- summary := auto_summarize(
- [x.get(k) for x in data if isinstance(x, dict)]
- )
- ) is not None:
- if k in summary:
- result.update(summary)
- else:
- result[k] = summary
- if not result:
- return None
- return result
- elif isinstance(val, BaseModel):
- return auto_summarize([x.model_dump() for x in data])
- return None
-
-
-def get_scorer_attributes(
- scorer: Union[Callable, Op, Scorer],
-) -> Tuple[str, Callable, Callable]:
- if weave_isinstance(scorer, Scorer):
- scorer_name = scorer.name
- if scorer_name is None:
- scorer_name = scorer.__class__.__name__
- try:
- score_fn = scorer.score
- summarize_fn = scorer.summarize # type: ignore
- except AttributeError:
- raise ValueError(
- f"Scorer {scorer_name} must implement score and summarize methods. Did you forget to wrap with @weave.op()?"
- )
- elif callable(scorer):
- if is_op(scorer):
- scorer = as_op(scorer)
- scorer_name = scorer.name
- else:
- scorer_name = scorer.__name__
- score_fn = scorer
- summarize_fn = auto_summarize # type: ignore
- else:
- raise ValueError(f"Unknown scorer type: {scorer}")
- return (scorer_name, score_fn, summarize_fn) # type: ignore
-
-
-def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
- # if any denom is zero, then zero. could use NaN instead...
- precision: float = 0
- if tp or fp:
- precision = tp / (tp + fp)
- recall: float = 0
- if tp or fn:
- recall = tp / (tp + fn)
- f1: float = 0
- if precision or recall:
- f1 = 2 * (precision * recall) / (precision + recall)
- return precision, recall, f1
-
-
-class MultiTaskBinaryClassificationF1(Scorer):
- class_names: list[str]
-
- @weave.op()
- def summarize(self, score_rows: list) -> Optional[dict]:
- result = {}
- cols = transpose(score_rows)
-
- for class_name in self.class_names:
- col = cols[class_name]
- tp = sum(r["correct"] and not r["negative"] for r in col)
- fp = sum(not r["correct"] and not r["negative"] for r in col)
- fn = sum(not r["correct"] and r["negative"] for r in col)
- precision, recall, f1 = p_r_f1(tp, fp, fn)
- result[class_name] = {"f1": f1, "precision": precision, "recall": recall}
-
- return result
-
- @weave.op()
- def score(self, target: dict, model_output: Optional[dict]) -> dict:
- result = {}
- for class_name in self.class_names:
- class_label = target.get(class_name)
- class_model_output = model_output.get(class_name) if model_output else None
- result[class_name] = {
- "correct": class_label == class_model_output,
- "negative": not class_model_output,
- }
- return result
-
-
-def transpose(rows: list[dict]) -> dict[str, list]:
- cols = defaultdict(list)
- for row in rows:
- for k, v in row.items():
- cols[k].append(v)
- return dict(cols)
+# Keeping this file for now to avoid breaking changes.
+# In the future, users should import all scoring functionality from weave.scorers.
+import warnings
+
+from weave.scorers import *
+
+warnings.warn(
+ "Importing from weave.flow.scorer is deprecated. "
+ "Please import from weave.scorers in the future.",
+ DeprecationWarning,
+ stacklevel=2,
+)
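+
+# Preferred import path going forward (illustrative):
+#   from weave.scorers import Scorer, auto_summarize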
diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py
new file mode 100644
index 00000000000..941f48e7b13
--- /dev/null
+++ b/weave/scorers/__init__.py
@@ -0,0 +1,55 @@
+from weave.scorers.base_scorer import (
+ Scorer,
+ auto_summarize,
+ get_scorer_attributes,
+)
+from weave.scorers.classification_scorer import (
+ MultiTaskBinaryClassificationF1,
+ transpose,
+)
+from weave.scorers.hallucination_scorer import HallucinationFreeScorer
+from weave.scorers.json_scorer import ValidJSONScorer
+from weave.scorers.llm_scorer import (
+ InstructorLLMScorer,
+ LLMScorer,
+)
+from weave.scorers.llm_utils import (
+ create,
+ embed,
+)
+from weave.scorers.moderation_scorer import OpenAIModerationScorer
+from weave.scorers.pydantic_scorer import PydanticScorer
+from weave.scorers.ragas_scorer import (
+ ContextEntityRecallScorer,
+ ContextRelevancyScorer,
+)
+from weave.scorers.similarity_scorer import EmbeddingSimilarityScorer
+from weave.scorers.string_scorer import (
+ LevenshteinScorer,
+ StringMatchScorer,
+)
+from weave.scorers.summarization_scorer import SummarizationScorer
+from weave.scorers.xml_scorer import ValidXMLScorer
+
+__all__ = [
+ "auto_summarize",
+ "create",
+ "embed",
+ "ContextEntityRecallScorer",
+ "ContextRelevancyScorer",
+ "EmbeddingSimilarityScorer",
+ "get_scorer_attributes",
+ "HallucinationFreeScorer",
+ "InstructorLLMScorer",
+ "ValidJSONScorer",
+ "LevenshteinScorer",
+ "LLMScorer",
+ "MultiTaskBinaryClassificationF1",
+ "OpenAIModerationScorer",
+ "PydanticScorer",
+ "Scorer",
+ "StringMatchScorer",
+ "SummarizationScorer",
+ "transpose",
+ "ValidXMLScorer",
+]
diff --git a/weave/scorers/base_scorer.py b/weave/scorers/base_scorer.py
new file mode 100644
index 00000000000..a0eec1ac09c
--- /dev/null
+++ b/weave/scorers/base_scorer.py
@@ -0,0 +1,109 @@
+from numbers import Number
+from typing import Any, Callable, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import weave
+from weave.flow.obj import Object
+from weave.trace.isinstance import weave_isinstance
+from weave.trace.op import Op, as_op, is_op
+
+
+class Scorer(Object):
+ column_map: Optional[dict[str, str]] = Field(
+ default=None,
+ description="A mapping from column names in the dataset to the names expected by the scorer",
+ )
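+    # Illustrative example (hypothetical names): column_map={"ground_truth": "answer"}
+    # passes the dataset's "answer" column to the scorer's `ground_truth` argument.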
+
+ def score(self, input: Any, target: Any, output: Any) -> Any:
+ raise NotImplementedError
+
+ @weave.op()
+ def summarize(self, score_rows: list) -> Optional[dict]:
+ return auto_summarize(score_rows)
+
+
+def stderr(data: Sequence[Union[int, float]]) -> float:
+ if len(data) > 1:
+ sample_variance = np.var(data, ddof=1)
+ return float(np.sqrt(sample_variance / len(data)))
+ else:
+ return 0
+
+
+def auto_summarize(data: list) -> Optional[dict[str, Any]]:
+ """Automatically summarize a list of (potentially nested) dicts.
+
+ Computes:
+ - avg for numeric cols
+ - count and fraction for boolean cols
+ - other col types are ignored
+
+ If col is all None, result is None
+
+ Returns:
+ dict of summary stats, with structure matching input dict structure.
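+
+    Example (illustrative):
+        auto_summarize([{"correct": True}, {"correct": False}])
+        # -> {"correct": {"true_count": 1, "true_fraction": 0.5}}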
+ """
+ if not data:
+ return {}
+ data = [x for x in data if x is not None]
+
+ if not data:
+ return None
+
+ val = data[0]
+
+ if isinstance(val, bool):
+ return {
+ "true_count": (true_count := sum(1 for x in data if x)),
+ "true_fraction": true_count / len(data),
+ }
+ elif isinstance(val, Number):
+ return {"mean": np.mean(data).item()}
+ elif isinstance(val, dict):
+ result = {}
+ all_keys = set().union(*[x.keys() for x in data if isinstance(x, dict)])
+ for k in all_keys:
+ if (
+ summary := auto_summarize(
+ [x.get(k) for x in data if isinstance(x, dict)]
+ )
+ ) is not None:
+ if k in summary:
+ result.update(summary)
+ else:
+ result[k] = summary
+ if not result:
+ return None
+ return result
+ elif isinstance(val, BaseModel):
+ return auto_summarize([x.model_dump() for x in data])
+ return None
+
+
+def get_scorer_attributes(
+ scorer: Union[Callable, Op, Scorer],
+) -> Tuple[str, Callable, Callable]:
+ if weave_isinstance(scorer, Scorer):
+ scorer_name = scorer.name
+ if scorer_name is None:
+ scorer_name = scorer.__class__.__name__
+ try:
+ score_fn = scorer.score
+ summarize_fn = scorer.summarize # type: ignore
+ except AttributeError:
+ raise ValueError(
+ f"Scorer {scorer_name} must implement score and summarize methods. Did you forget to wrap with @weave.op()?"
+ )
+ elif callable(scorer):
+ if is_op(scorer):
+ scorer = as_op(scorer)
+ scorer_name = scorer.name
+ else:
+ scorer_name = scorer.__name__
+ score_fn = scorer
+ summarize_fn = auto_summarize # type: ignore
+ else:
+ raise ValueError(f"Unknown scorer type: {scorer}")
+ return (scorer_name, score_fn, summarize_fn) # type: ignore
diff --git a/weave/scorers/classification_scorer.py b/weave/scorers/classification_scorer.py
new file mode 100644
index 00000000000..7c6cb1207c3
--- /dev/null
+++ b/weave/scorers/classification_scorer.py
@@ -0,0 +1,58 @@
+from collections import defaultdict
+from typing import Optional, Tuple
+
+import weave
+from weave.scorers.base_scorer import Scorer
+
+
+def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
+ # if any denom is zero, then zero. could use NaN instead...
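+    # Illustrative example: tp=8, fp=2, fn=2 -> precision=0.8, recall=0.8, f1=0.8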
+ precision: float = 0
+ if tp or fp:
+ precision = tp / (tp + fp)
+ recall: float = 0
+ if tp or fn:
+ recall = tp / (tp + fn)
+ f1: float = 0
+ if precision or recall:
+ f1 = 2 * (precision * recall) / (precision + recall)
+ return precision, recall, f1
+
+
+class MultiTaskBinaryClassificationF1(Scorer):
+ class_names: list[str]
+
+ @weave.op()
+ def summarize(self, score_rows: list) -> Optional[dict]:
+ result = {}
+ cols = transpose(score_rows)
+
+ for class_name in self.class_names:
+ col = cols[class_name]
+ tp = sum(r["correct"] and not r["negative"] for r in col)
+ fp = sum(not r["correct"] and not r["negative"] for r in col)
+ fn = sum(not r["correct"] and r["negative"] for r in col)
+ precision, recall, f1 = p_r_f1(tp, fp, fn)
+ result[class_name] = {"f1": f1, "precision": precision, "recall": recall}
+
+ return result
+
+ @weave.op()
+ def score(self, target: dict, output: Optional[dict]) -> dict:
+ result = {}
+ for class_name in self.class_names:
+ class_label = target.get(class_name)
+ class_output = output.get(class_name) if output else None
+ result[class_name] = {
+ "correct": class_label == class_output,
+ "negative": not class_output,
+ }
+ return result
+
+
+def transpose(rows: list[dict]) -> dict[str, list]:
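+    # Illustrative example: transpose([{"a": 1}, {"a": 2}]) -> {"a": [1, 2]}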
+ cols = defaultdict(list)
+ for row in rows:
+ for k, v in row.items():
+ cols[k].append(v)
+ return dict(cols)
diff --git a/weave/scorers/hallucination_scorer.py b/weave/scorers/hallucination_scorer.py
new file mode 100644
index 00000000000..1aee2012134
--- /dev/null
+++ b/weave/scorers/hallucination_scorer.py
@@ -0,0 +1,160 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+import weave
+from weave.scorers.llm_scorer import InstructorLLMScorer
+from weave.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create
+from weave.scorers.utils import stringify
+
+DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """
+Given some from a user and an