From cc3b87213464fe767ab284b1c182caaf53d255a7 Mon Sep 17 00:00:00 2001
From: Scott Condron
Date: Fri, 5 Apr 2024 14:46:27 +0100
Subject: [PATCH] chore(weave): better eval docs (#1484)

* chore: better eval docs

* chore: make all evals run
---
 docs/docs/guides/core-types/evaluations.md | 142 +++++++++++++++++++--
 1 file changed, 130 insertions(+), 12 deletions(-)

diff --git a/docs/docs/guides/core-types/evaluations.md b/docs/docs/guides/core-types/evaluations.md
index 448af43748a..027b525da4f 100644
--- a/docs/docs/guides/core-types/evaluations.md
+++ b/docs/docs/guides/core-types/evaluations.md
@@ -5,36 +5,154 @@ hide_table_of_contents: true
 
 # Evaluation
 
-Evaluation-driven development helps you reliably iterate on an application. The `Evaluation` class is designed to assess the performance of a `Model` on a given `Dataset` or set of examples using specified scoring functions.
+Evaluation-driven development helps you reliably iterate on an application. The `Evaluation` class is designed to assess the performance of a `Model` on a given `Dataset` or set of examples using scoring functions.
 
 ```python
+import weave
 from weave import Evaluation
+import asyncio
 
+# Collect your examples
+examples = [
+    {"question": "What is the capital of France?", "expected": "Paris"},
+    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
+    {"question": "What is the square root of 64?", "expected": "8"},
+]
+
+# Define any custom scoring function
+@weave.op()
+def match_score1(expected: str, model_output: dict) -> dict:
+    # Here is where you'd define the logic to score the model output
+    return {'match': expected == model_output['generated_text']}
+
+@weave.op()
+def function_to_evaluate(question: str):
+    # here's where you would add your LLM call and return the output
+    return {'generated_text': 'Paris'}
+
+# Score your examples using scoring functions
 evaluation = Evaluation(
-    dataset=dataset, scorers=[score]
+    dataset=examples, scorers=[match_score1]
 )
-evaluation.evaluate(model)
+
+# Start tracking the evaluation
+weave.init('intro-example')
+# Run the evaluation
+asyncio.run(evaluation.evaluate(function_to_evaluate))
 ```
 
 ## Create an Evaluation
 
-To systematically improve your application, it's very helpful to test your changes against a consistent dataset of potential inputs so that you catch regressions. Using the `Evaluation` class, you can be sure you're comparing apples-to-apples by keeping track of the model and dataset versions used.
+To systematically improve your application, it's helpful to test your changes against a consistent dataset of potential inputs so that you catch regressions and can inspect your app's behaviour under different conditions. Using the `Evaluation` class, you can be sure you're comparing apples-to-apples by keeping track of all of the details that you're experimenting and evaluating with.
+
+Weave will take each example, pass it through your application, and score the output with each of your custom scoring functions. By doing this, you'll have a view of the performance of your application and a rich UI to drill into individual outputs and scores.
 
 ### Define an evaluation dataset
 
-First, define a [Dataset](/guides/core-types/datasets) or list of examples with a collection of examples to be evaluated. These examples are often failure cases that you want to test for, these are similar to unit tests in Test-Driven Development (TDD).
+First, define a [Dataset](/guides/core-types/datasets) or list of dictionaries with a collection of examples to be evaluated. These examples are often failure cases that you want to test for; they are similar to unit tests in Test-Driven Development (TDD).
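+
+If you'd prefer the examples to live in a tracked, versioned object instead of an inline list, the same rows can be wrapped in a `Dataset` (see the [Dataset](/guides/core-types/datasets) guide). A minimal sketch; the name `'qa-examples'` is only illustrative:
+
+```python
+from weave import Dataset
+
+# The same example rows as above, collected into a named Dataset so that
+# Weave can version them instead of re-declaring them inline each time.
+dataset = Dataset(name='qa-examples', rows=[
+    {"question": "What is the capital of France?", "expected": "Paris"},
+    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
+    {"question": "What is the square root of 64?", "expected": "8"},
+])
+
+# A Dataset like this can be passed to Evaluation via its `dataset` argument,
+# in the same way as the plain list of dictionaries used on the rest of this page.
+```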
 
-### Define a scoring function
+### Defining scoring functions
 
-Then, define a list of scoring functions. Each function should take an example and a prediction, returning a dictionary with the scores.
+Then, create a list of scoring functions. These are used to score each example. Each function should take `model_output` and, optionally, other inputs from your examples, and return a dictionary with the scores.
+
+Scoring functions need to have a `model_output` keyword argument; the other arguments are user defined and are taken from the dataset examples. Weave only passes the necessary keys to each scorer, matching each scorer argument name to a dictionary key in the example.
+
+The scorer below takes `expected` from each example dictionary for scoring.
 
-```
-def match(answer: dict, model_output: dict ) -> dict:
-    return {'match': answer['expected_text'] == model_output['generated_text']}
-```
+```python
+import weave
+
+# Collect your examples
+examples = [
+    {"question": "What is the capital of France?", "expected": "Paris"},
+    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
+    {"question": "What is the square root of 64?", "expected": "8"},
+]
+
+# Define any custom scoring function
+@weave.op()
+def match_score1(expected: str, model_output: dict) -> dict:
+    # Here is where you'd define the logic to score the model output
+    return {'match': expected == model_output['generated_text']}
+```
+
+### Defining a Model to evaluate
+
+To evaluate a `Model`, call `evaluate` on it using an `Evaluation`. `Models` are used when you have attributes that you want to experiment with and capture in Weave.
+
+```python
+from weave import Model, Evaluation
+import asyncio
+
+class MyModel(Model):
+    prompt: str
+
+    @weave.op()
+    def predict(self, question: str):
+        # here's where you would add your LLM call and return the output
+        return {'generated_text': 'Hello, ' + self.prompt}
+
+model = MyModel(prompt='World')
+evaluation = Evaluation(
+    dataset=examples, scorers=[match_score1]
+)
+weave.init('intro-example') # begin tracking results with weave
+asyncio.run(evaluation.evaluate(model))
+```
+
+This will run `predict` on each example and score the output with each scoring function.
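+
+Because `prompt` is an attribute of the `Model`, you can compare configurations by evaluating one instance per variant. A rough sketch reusing `examples`, `match_score1` and `MyModel` from above; the prompt values are only illustrative:
+
+```python
+# Evaluate two variants of the same Model; each run is tracked separately
+# in Weave, so the aggregate scores can be compared afterwards.
+for prompt in ['World', 'Weave']:
+    model_variant = MyModel(prompt=prompt)
+    asyncio.run(evaluation.evaluate(model_variant))
+```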
 
-### Evaluate
+### Defining a function to evaluate
 
-Finally, create a model and pass this to `evaluation.evaluate`, which will run `predict` on each example and score the output with each scoring functions.
+Alternatively, you can evaluate a function that is wrapped in a `@weave.op()`.
+
+```python
+@weave.op()
+def function_to_evaluate(question: str):
+    # here's where you would add your LLM call and return the output
+    return {'generated_text': 'some response'}
+
+asyncio.run(evaluation.evaluate(function_to_evaluate))
+```
+
+### Pulling it all together
 
-To see this in action, follow the '[Build an Evaluation pipeline](/tutorial-eval)' tutorial.
+```python
+from weave import Evaluation, Model
+import weave
+import asyncio
+weave.init('intro-example')
+examples = [
+    {"question": "What is the capital of France?", "expected": "Paris"},
+    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
+    {"question": "What is the square root of 64?", "expected": "8"},
+]
+
+@weave.op()
+def match_score1(expected: str, model_output: dict) -> dict:
+    return {'match': expected == model_output['generated_text']}
+
+@weave.op()
+def match_score2(expected: str, model_output: dict) -> dict:
+    return {'match': expected == model_output['generated_text']}
+
+class MyModel(Model):
+    prompt: str
+
+    @weave.op()
+    def predict(self, question: str):
+        # here's where you would add your LLM call and return the output
+        return {'generated_text': 'Hello, ' + question + self.prompt}
+
+model = MyModel(prompt='World')
+evaluation = Evaluation(dataset=examples, scorers=[match_score1, match_score2])
+
+asyncio.run(evaluation.evaluate(model))
+
+@weave.op()
+def function_to_evaluate(question: str):
+    # here's where you would add your LLM call and return the output
+    return {'generated_text': 'some response' + question}
+
+asyncio.run(evaluation.evaluate(function_to_evaluate))
+```
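+
+The `evaluate` call also returns a summary of the results, which can be useful in scripts. A minimal sketch; the exact shape of the returned dictionary is an assumption here, and the Weave UI remains the richest view of per-example results:
+
+```python
+# Capture the aggregate results of a run instead of only viewing them in the UI.
+# The precise structure of `summary` may differ between Weave versions,
+# so print it first before relying on specific keys.
+summary = asyncio.run(evaluation.evaluate(model))
+print(summary)
+```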