diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md
index 86ff37e90b4..09037fa23e2 100644
--- a/docs/docs/tutorial-eval.md
+++ b/docs/docs/tutorial-eval.md
@@ -55,7 +55,7 @@ You can instantiate `Model` objects as normal like this:
 model = ExtractFruitsModel(model_name='gpt-3.5-turbo-1106',
                            prompt_template='Extract fields ("fruit": , "color": , "flavor": ) from the following text, as json: {sentence}')
 sentence = "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy."
-print(asyncio.run(model.predict(sentence))) 
+print(asyncio.run(model.predict(sentence)))
 # if you're in a Jupyter Notebook, run:
 # await model.predict(sentence)
 ```
@@ -64,12 +64,11 @@ print(asyncio.run(model.predict(sentence)))
 Checkout the [Models](/guides/core-types/models) guide to learn more.
 :::
 
-
 ### Collect some examples
 
 ```python
-sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", 
-"Pounits are a bright green color and are more savory than sweet.", 
+sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
+"Pounits are a bright green color and are more savory than sweet.",
 "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
 labels = [
     {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -87,13 +86,13 @@ examples = [
 
 `Evaluation`s assess a `Model`s performance on a set of examples using a list of specified scoring functions.
 
-Here, we'll use a default scoring function `MulticlassF1Score` and we'll also define our own `fruit_name_score`.
+Here, we'll use a default scoring function `MultiTaskBinaryClassificationF1` and we'll also define our own `fruit_name_score`.
 
-Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions. 
+Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions.
 
 ```python
 import weave
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 
 @weave.op()
 def fruit_name_score(target: dict, prediction: dict) -> dict:
@@ -102,7 +101,7 @@ def fruit_name_score(target: dict, prediction: dict) -> dict:
 # highlight-next-line
 evaluation = weave.Evaluation(
     # highlight-next-line
-    dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+    dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
     # highlight-next-line
 )
 # highlight-next-line
@@ -119,10 +118,10 @@ import asyncio
 # highlight-next-line
 import weave
 # highlight-next-line
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 import openai
 
-# We create a model class with one predict function. 
+# We create a model class with one predict function.
 # All inputs, predictions and parameters are automatically captured for easy inspection.
 # highlight-next-line
 class ExtractFruitsModel(weave.Model):
@@ -154,10 +153,10 @@ weave.init('intro-example')
 
 # We create our model with our system prompt.
 model = ExtractFruitsModel(name='gpt4',
-                           model_name='gpt-4-0125-preview', 
+                           model_name='gpt-4-0125-preview',
                            prompt_template='Extract fields ("fruit": , "color": , "flavor") from the following text, as json: {sentence}')
-sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", 
-"Pounits are a bright green color and are more savory than sweet.", 
+sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
+"Pounits are a bright green color and are more savory than sweet.",
 "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
 labels = [
     {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -177,13 +176,13 @@ examples = [
 def fruit_name_score(target: dict, prediction: dict) -> dict:
     return {'correct': target['fruit'] == prediction['fruit']}
-# Finally, we run an evaluation of this model. 
+# Finally, we run an evaluation of this model.
 # This will generate a prediction for each input example, and then score it with each scoring function.
 # highlight-next-line
 evaluation = weave.Evaluation(
     name='fruit_eval',
     # highlight-next-line
-    dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+    dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
     # highlight-next-line
 )
 # highlight-next-line
 print(asyncio.run(evaluation.evaluate(model)))
diff --git a/examples/text-extract/evaluate.py b/examples/text-extract/evaluate.py
index 8cd0b423f65..5f846d32f37 100644
--- a/examples/text-extract/evaluate.py
+++ b/examples/text-extract/evaluate.py
@@ -7,7 +7,7 @@
 import openai
 
 import weave
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 
 
 class TextExtractModel(weave.Model):
@@ -42,7 +42,7 @@ def main():
 
     eval = weave.Evaluation(
         dataset=dataset_rows,
-        scorers=[MulticlassF1Score(class_names=["name", "shares"])],
+        scorers=[MultiTaskBinaryClassificationF1(class_names=["name", "shares"])],
     )
 
     model = TextExtractModel(
diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index c9b4f533158..25f2fe1e02a 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -105,7 +105,7 @@ async def predict_and_score(
                 f"{model_predict} expects arguments: {model_predict_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
             )
         try:
-            prediction = await async_call(model_predict, **model_predict_args)
+            model_output = await async_call(model_predict, **model_predict_args)
         except OpCallError as e:
             dataset_column_names = list(example.keys())
             dataset_column_names_str = ", ".join(dataset_column_names[:3])
@@ -131,9 +131,9 @@
             )
             raise OpCallError(message)
         except Exception as e:
-            print("Prediction failed")
+            print("model_output failed")
             traceback.print_exc()
-            prediction = None
+            model_output = None
 
         scores = {}
         scorers = typing.cast(list[Union[Op, Scorer]], self.scorers or [])
@@ -151,9 +151,9 @@
             if isinstance(score_arg_names, BoundOp):
                 score_arg_names = score_arg_names[1:]
 
-            if "prediction" not in score_arg_names:
+            if "model_output" not in score_arg_names:
                 raise OpCallError(
-                    f"Scorer {scorer_name} must have a 'prediction' argument, to receive the output of the model function."
+                    f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function."
                 )
 
             if isinstance(example, dict):
@@ -165,7 +165,7 @@
                 raise ValueError(
                     f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
                 )
-            score_args["prediction"] = prediction
+            score_args["model_output"] = model_output
 
             try:
                 result = await async_call(score_fn, **score_args)
@@ -181,7 +181,7 @@
                 ]
                 if isinstance(score_fn, BoundOp):
                     required_arg_names = required_arg_names[1:]
-                required_arg_names.remove("prediction")
+                required_arg_names.remove("model_output")
 
                 message = textwrap.dedent(
                     f"""
@@ -196,7 +196,7 @@
             scores[scorer_name] = result
 
         return {
-            "prediction": prediction,
+            "model_output": model_output,
             "scores": scores,
         }
 
@@ -205,9 +205,9 @@ async def summarize(self, eval_table: typing.Union[weave.WeaveList, list]) -> di
         summary = {}
         if not isinstance(eval_table, weave.WeaveList):
             eval_table = weave.WeaveList(eval_table)
-        prediction_summary = auto_summarize(eval_table.column("prediction"))
-        if prediction_summary:
-            summary["prediction"] = prediction_summary
+        model_output_summary = auto_summarize(eval_table.column("model_output"))
+        if model_output_summary:
+            summary["model_output"] = model_output_summary
         scorers = self.scorers or []
         for scorer in scorers:
             scorer_name, _, summarize_fn = get_scorer_attributes(scorer)
@@ -229,7 +229,7 @@ async def eval_example(example: dict) -> dict:
             except Exception as e:
                 print("Predict and score failed")
                 traceback.print_exc()
-                return {"prediction": None, "scores": {}}
+                return {"model_output": None, "scores": {}}
             return eval_row
 
         n_complete = 0
@@ -245,7 +245,7 @@ async def eval_example(example: dict) -> dict:
            # f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]"  # type:ignore
            # )
            if eval_row == None:
-                eval_row = {"prediction": None, "scores": {}}
+                eval_row = {"model_output": None, "scores": {}}
            if eval_row["scores"] == None:
                eval_row["scores"] = {}
            for scorer in self.scorers or []:
diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py
index da77d56bb3d..9c0471d3524 100644
--- a/weave/flow/scorer.py
+++ b/weave/flow/scorer.py
@@ -8,7 +8,7 @@
 
 
 class Scorer(Object):
-    def score(self, target: Any, prediction: Any) -> Any:
+    def score(self, target: Any, model_output: Any) -> Any:
         raise NotImplementedError
 
     @weave.op()
@@ -105,7 +105,7 @@ def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
     return precision, recall, f1
 
 
-class MulticlassF1Score(Scorer):
+class MultiTaskBinaryClassificationF1(Scorer):
     class_names: list[str]
 
     @weave.op()
@@ -130,13 +130,13 @@ def summarize(self, score_rows: WeaveList) -> Optional[dict]:
         return result
 
     @weave.op()
-    def score(self, target: dict, prediction: Optional[dict]) -> dict:
+    def score(self, target: dict, model_output: Optional[dict]) -> dict:
         result = {}
         for class_name in self.class_names:
             class_label = target.get(class_name)
-            class_prediction = prediction.get(class_name) if prediction else None
+            class_model_output = model_output.get(class_name) if model_output else None
             result[class_name] = {
-                "correct": class_label == class_prediction,
-                "negative": not class_prediction,
+                "correct": class_label == class_model_output,
+                "negative": not class_model_output,
             }
         return result
diff --git a/weave/tests/test_client_trace.py b/weave/tests/test_client_trace.py
index 2b63fbc2a36..e5f40d4d1de 100644
--- a/weave/tests/test_client_trace.py
+++ b/weave/tests/test_client_trace.py
@@ -927,7 +927,7 @@ def test_named_reuse(client):
     dataset = weave.ref(d_ref.uri()).get()
 
     @weave.op()
-    async def dummy_score(prediction):
+    async def dummy_score(model_output):
         return 1
 
     class SimpleModel(weave.Model):
diff --git a/weave/tests/test_evaluate.py b/weave/tests/test_evaluate.py
index a0d1a9dee08..4a4c55ccb2d 100644
--- a/weave/tests/test_evaluate.py
+++ b/weave/tests/test_evaluate.py
@@ -2,7 +2,7 @@
 import pytest
 import weave
 from weave import ref_base
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 from weave import Dataset, Model, Evaluation
 
 pytestmark = pytest.mark.webtest
@@ -12,7 +12,7 @@
 dataset = Dataset(rows=dataset_rows)
 
 expected_eval_result = {
-    "prediction": {"mean": 9.5},
+    "model_output": {"mean": 9.5},
     "score": {"true_count": 1, "true_fraction": 0.5},
 }
 
@@ -24,8 +24,8 @@ async def predict(self, input) -> str:
 
 
 @weave.op()
-def score(target, prediction):
-    return target == prediction
+def score(target, model_output):
+    return target == model_output
 
 
 @weave.op()
@@ -57,7 +57,7 @@ async def model_predict(input, target) -> str:
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     assert result == {
-        "prediction": {"mean": 18.5},
+        "model_output": {"mean": 18.5},
         "score": {"true_count": 0, "true_fraction": 0.0},
     }
 
@@ -108,8 +108,8 @@ async def infer(self, input) -> str:
 def test_score_as_class(client):
     class MyScorer(weave.Scorer):
         @weave.op()
-        def score(self, target, prediction):
-            return target == prediction
+        def score(self, target, model_output):
+            return target == model_output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -118,7 +118,7 @@ def score(self, target, prediction):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "MyScorer": {"true_count": 1, "true_fraction": 0.5},
     }
 
@@ -131,8 +131,8 @@ def summarize(self, score_rows):
             return {"awesome": 3}
 
         @weave.op()
-        def score(self, target, prediction):
-            return target == prediction
+        def score(self, target, model_output):
+            return target == model_output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -141,7 +141,7 @@ def score(self, target, prediction):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "MyScorer": {"awesome": 3},
     }
 
@@ -149,7 +149,7 @@ def score(self, target, prediction):
 def test_multiclass_f1_score(client):
     evaluation = Evaluation(
         dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
-        scorers=[MulticlassF1Score(class_names=["a", "b"])],
+        scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
     )
 
     @weave.op()
@@ -158,11 +158,11 @@ def return_pred(pred):
         return pred
     result = asyncio.run(evaluation.evaluate(return_pred))
     assert result == {
-        "prediction": {
+        "model_output": {
             "a": {"true_count": 1, "true_fraction": 1.0},
             "b": {"true_count": 0, "true_fraction": 0.0},
         },
-        "MulticlassF1Score": {
+        "MultiTaskBinaryClassificationF1": {
             "a": {"f1": 0, "precision": 0.0, "recall": 0},
             "b": {"f1": 0, "precision": 0, "recall": 0.0},
         },
diff --git a/weave/tests/test_weave_client.py b/weave/tests/test_weave_client.py
index def19f0ac01..fac1a99c70c 100644
--- a/weave/tests/test_weave_client.py
+++ b/weave/tests/test_weave_client.py
@@ -443,8 +443,8 @@ async def model_predict(input) -> str:
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, prediction):
-        return target == prediction
+    async def score(target, model_output):
+        return target == model_output
 
     evaluation = Evaluation(
         name="my-eval",
@@ -453,7 +453,7 @@
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     expected_eval_result = {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "score": {"true_count": 1, "true_fraction": 0.5},
     }
     assert result == expected_eval_result
@@ -553,8 +553,8 @@ def test_nested_ref_is_inner(client):
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, prediction):
-        return target == prediction
+    async def score(target, model_output):
+        return target == model_output
 
     evaluation = Evaluation(
         name="my-eval",