chore(weave): Final name adjustments to scorers (#1471)
* change MulticlassF1Score to MultiTaskBinaryClassificationF1

* change prediction to model output
tssweeney authored Apr 3, 2024
1 parent a7ded8f commit 07f9019
Showing 7 changed files with 55 additions and 56 deletions.
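Taken together, the renames in this commit touch downstream code in two places: the scorer import and the argument name that receives the model's result. A minimal post-rename sketch, assuming only the API shown in the diffs below; `my_examples` and its fields are illustrative placeholders, not part of this commit:

```python
import weave
from weave.flow.scorer import MultiTaskBinaryClassificationF1  # formerly MulticlassF1Score

# Placeholder dataset rows, chosen only to illustrate the argument names.
my_examples = [
    {"sentence": "Pounits are a bright green color.",
     "target": {"fruit": "pounits", "color": "green"}},
]

@weave.op()
def fruit_name_score(target: dict, model_output: dict) -> dict:
    # Custom scorers now receive the model's result as `model_output`
    # (previously `prediction`), as enforced in weave/flow/eval.py below.
    return {"correct": target["fruit"] == model_output["fruit"]}

evaluation = weave.Evaluation(
    dataset=my_examples,
    scorers=[
        MultiTaskBinaryClassificationF1(class_names=["fruit", "color"]),
        fruit_name_score,
    ],
)
```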
29 changes: 14 additions & 15 deletions docs/docs/tutorial-eval.md
@@ -55,7 +55,7 @@ You can instantiate `Model` objects as normal like this:
model = ExtractFruitsModel(model_name='gpt-3.5-turbo-1106',
prompt_template='Extract fields ("fruit": <str>, "color": <str>, "flavor": <str>) from the following text, as json: {sentence}')
sentence = "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy."
print(asyncio.run(model.predict(sentence)))
# if you're in a Jupyter Notebook, run:
# await model.predict(sentence)
```
@@ -64,12 +64,11 @@ print(asyncio.run(model.predict(sentence)))
Checkout the [Models](/guides/core-types/models) guide to learn more.
:::


### Collect some examples

```python
sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
"Pounits are a bright green color and are more savory than sweet.",
"Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
labels = [
{'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -87,13 +86,13 @@ examples = [

`Evaluation`s assess a `Model`s performance on a set of examples using a list of specified scoring functions.

-Here, we'll use a default scoring function `MulticlassF1Score` and we'll also define our own `fruit_name_score`.
+Here, we'll use a default scoring function `MultiTaskBinaryClassificationF1` and we'll also define our own `fruit_name_score`.

Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions.

```python
import weave
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1

@weave.op()
def fruit_name_score(target: dict, prediction: dict) -> dict:
@@ -102,7 +101,7 @@ def fruit_name_score(target: dict, prediction: dict) -> dict:
# highlight-next-line
evaluation = weave.Evaluation(
# highlight-next-line
-dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
# highlight-next-line
)
# highlight-next-line
@@ -119,10 +118,10 @@ import asyncio
# highlight-next-line
import weave
# highlight-next-line
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
import openai

# We create a model class with one predict function.
# All inputs, predictions and parameters are automatically captured for easy inspection.

# highlight-next-line
@@ -154,10 +153,10 @@ weave.init('intro-example')

# We create our model with our system prompt.
model = ExtractFruitsModel(name='gpt4',
model_name='gpt-4-0125-preview',
prompt_template='Extract fields ("fruit": <str>, "color": <str>, "flavor") from the following text, as json: {sentence}')
sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
"Pounits are a bright green color and are more savory than sweet.",
"Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
labels = [
{'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -177,13 +176,13 @@ examples = [
def fruit_name_score(target: dict, prediction: dict) -> dict:
return {'correct': target['fruit'] == prediction['fruit']}

# Finally, we run an evaluation of this model.
# This will generate a prediction for each input example, and then score it with each scoring function.
# highlight-next-line
evaluation = weave.Evaluation(
name='fruit_eval',
# highlight-next-line
-dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
# highlight-next-line
)
print(asyncio.run(evaluation.evaluate(model)))
4 changes: 2 additions & 2 deletions examples/text-extract/evaluate.py
@@ -7,7 +7,7 @@
import openai
import weave

-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1


class TextExtractModel(weave.Model):
@@ -42,7 +42,7 @@ def main():

eval = weave.Evaluation(
dataset=dataset_rows,
-scorers=[MulticlassF1Score(class_names=["name", "shares"])],
+scorers=[MultiTaskBinaryClassificationF1(class_names=["name", "shares"])],
)

model = TextExtractModel(
26 changes: 13 additions & 13 deletions weave/flow/eval.py
@@ -105,7 +105,7 @@ async def predict_and_score(
f"{model_predict} expects arguments: {model_predict_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
)
try:
-prediction = await async_call(model_predict, **model_predict_args)
+model_output = await async_call(model_predict, **model_predict_args)
except OpCallError as e:
dataset_column_names = list(example.keys())
dataset_column_names_str = ", ".join(dataset_column_names[:3])
@@ -131,9 +131,9 @@
)
raise OpCallError(message)
except Exception as e:
print("Prediction failed")
print("model_output failed")
traceback.print_exc()
prediction = None
model_output = None

scores = {}
scorers = typing.cast(list[Union[Op, Scorer]], self.scorers or [])
@@ -151,9 +151,9 @@
if isinstance(score_arg_names, BoundOp):
score_arg_names = score_arg_names[1:]

if "prediction" not in score_arg_names:
if "model_output" not in score_arg_names:
raise OpCallError(
f"Scorer {scorer_name} must have a 'prediction' argument, to receive the output of the model function."
f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function."
)

if isinstance(example, dict):
@@ -165,7 +165,7 @@
raise ValueError(
f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
)
score_args["prediction"] = prediction
score_args["model_output"] = model_output

try:
result = await async_call(score_fn, **score_args)
@@ -181,7 +181,7 @@
]
if isinstance(score_fn, BoundOp):
required_arg_names = required_arg_names[1:]
-required_arg_names.remove("prediction")
+required_arg_names.remove("model_output")

message = textwrap.dedent(
f"""
@@ -196,7 +196,7 @@
scores[scorer_name] = result

return {
"prediction": prediction,
"model_output": model_output,
"scores": scores,
}

@@ -205,9 +205,9 @@ async def summarize(self, eval_table: typing.Union[weave.WeaveList, list]) -> dict:
summary = {}
if not isinstance(eval_table, weave.WeaveList):
eval_table = weave.WeaveList(eval_table)
-prediction_summary = auto_summarize(eval_table.column("prediction"))
-if prediction_summary:
-summary["prediction"] = prediction_summary
+model_output_summary = auto_summarize(eval_table.column("model_output"))
+if model_output_summary:
+summary["model_output"] = model_output_summary
scorers = self.scorers or []
for scorer in scorers:
scorer_name, _, summarize_fn = get_scorer_attributes(scorer)
@@ -229,7 +229,7 @@ async def eval_example(example: dict) -> dict:
except Exception as e:
print("Predict and score failed")
traceback.print_exc()
return {"prediction": None, "scores": {}}
return {"model_output": None, "scores": {}}
return eval_row

n_complete = 0
@@ -245,7 +245,7 @@
# f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]" # type:ignore
# )
if eval_row == None:
eval_row = {"prediction": None, "scores": {}}
eval_row = {"model_output": None, "scores": {}}
if eval_row["scores"] == None:
eval_row["scores"] = {}
for scorer in self.scorers or []:
12 changes: 6 additions & 6 deletions weave/flow/scorer.py
@@ -8,7 +8,7 @@


class Scorer(Object):
-def score(self, target: Any, prediction: Any) -> Any:
+def score(self, target: Any, model_output: Any) -> Any:
raise NotImplementedError

@weave.op()
@@ -105,7 +105,7 @@ def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
return precision, recall, f1


-class MulticlassF1Score(Scorer):
+class MultiTaskBinaryClassificationF1(Scorer):
class_names: list[str]

@weave.op()
@@ -130,13 +130,13 @@ def summarize(self, score_rows: WeaveList) -> Optional[dict]:
return result

@weave.op()
-def score(self, target: dict, prediction: Optional[dict]) -> dict:
+def score(self, target: dict, model_output: Optional[dict]) -> dict:
result = {}
for class_name in self.class_names:
class_label = target.get(class_name)
-class_prediction = prediction.get(class_name) if prediction else None
+class_model_output = model_output.get(class_name) if model_output else None
result[class_name] = {
"correct": class_label == class_prediction,
"negative": not class_prediction,
"correct": class_label == class_model_output,
"negative": not class_model_output,
}
return result
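
For orientation, a small illustrative call of the renamed scorer outside of an `Evaluation`. This snippet is not part of the diff; the expected result shape is inferred from the `score` implementation above.

```python
from weave.flow.scorer import MultiTaskBinaryClassificationF1

# Depending on the weave version, weave.init(<project>) may be needed
# before calling ops directly (assumption).
scorer = MultiTaskBinaryClassificationF1(class_names=["fruit", "color"])
row_result = scorer.score(
    target={"fruit": "neoskizzles", "color": "purple"},
    model_output={"fruit": "neoskizzles", "color": "green"},
)
# Expected per-class shape, per the loop over class_names above:
# {"fruit": {"correct": True, "negative": False},
#  "color": {"correct": False, "negative": False}}
```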
2 changes: 1 addition & 1 deletion weave/tests/test_client_trace.py
@@ -927,7 +927,7 @@ def test_named_reuse(client):
dataset = weave.ref(d_ref.uri()).get()

@weave.op()
-async def dummy_score(prediction):
+async def dummy_score(model_output):
return 1

class SimpleModel(weave.Model):
28 changes: 14 additions & 14 deletions weave/tests/test_evaluate.py
@@ -2,7 +2,7 @@
import pytest
import weave
from weave import ref_base
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
from weave import Dataset, Model, Evaluation

pytestmark = pytest.mark.webtest
@@ -12,7 +12,7 @@
dataset = Dataset(rows=dataset_rows)

expected_eval_result = {
"prediction": {"mean": 9.5},
"model_output": {"mean": 9.5},
"score": {"true_count": 1, "true_fraction": 0.5},
}

@@ -24,8 +24,8 @@ async def predict(self, input) -> str:


@weave.op()
-def score(target, prediction):
-return target == prediction
+def score(target, model_output):
+return target == model_output


@weave.op()
@@ -57,7 +57,7 @@ async def model_predict(input, target) -> str:
)
result = asyncio.run(evaluation.evaluate(model_predict))
assert result == {
"prediction": {"mean": 18.5},
"model_output": {"mean": 18.5},
"score": {"true_count": 0, "true_fraction": 0.0},
}

@@ -108,8 +108,8 @@ async def infer(self, input) -> str:
def test_score_as_class(client):
class MyScorer(weave.Scorer):
@weave.op()
-def score(self, target, prediction):
-return target == prediction
+def score(self, target, model_output):
+return target == model_output

evaluation = Evaluation(
dataset=dataset_rows,
@@ -118,7 +118,7 @@ def score(self, target, prediction):
model = EvalModel()
result = asyncio.run(evaluation.evaluate(model))
assert result == {
"prediction": {"mean": 9.5},
"model_output": {"mean": 9.5},
"MyScorer": {"true_count": 1, "true_fraction": 0.5},
}

@@ -131,8 +131,8 @@ def summarize(self, score_rows):
return {"awesome": 3}

@weave.op()
-def score(self, target, prediction):
-return target == prediction
+def score(self, target, model_output):
+return target == model_output

evaluation = Evaluation(
dataset=dataset_rows,
@@ -141,15 +141,15 @@ def score(self, target, prediction):
model = EvalModel()
result = asyncio.run(evaluation.evaluate(model))
assert result == {
"prediction": {"mean": 9.5},
"model_output": {"mean": 9.5},
"MyScorer": {"awesome": 3},
}


def test_multiclass_f1_score(client):
evaluation = Evaluation(
dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
-scorers=[MulticlassF1Score(class_names=["a", "b"])],
+scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
)

@weave.op()
Expand All @@ -158,11 +158,11 @@ def return_pred(pred):

result = asyncio.run(evaluation.evaluate(return_pred))
assert result == {
"prediction": {
"model_output": {
"a": {"true_count": 1, "true_fraction": 1.0},
"b": {"true_count": 0, "true_fraction": 0.0},
},
"MulticlassF1Score": {
"MultiTaskBinaryClassificationF1": {
"a": {"f1": 0, "precision": 0.0, "recall": 0},
"b": {"f1": 0, "precision": 0, "recall": 0.0},
},
10 changes: 5 additions & 5 deletions weave/tests/test_weave_client.py
@@ -443,8 +443,8 @@ async def model_predict(input) -> str:
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]

@weave.op()
-async def score(target, prediction):
-return target == prediction
+async def score(target, model_output):
+return target == model_output

evaluation = Evaluation(
name="my-eval",
@@ -453,7 +453,7 @@ async def score(target, prediction):
)
result = asyncio.run(evaluation.evaluate(model_predict))
expected_eval_result = {
"prediction": {"mean": 9.5},
"model_output": {"mean": 9.5},
"score": {"true_count": 1, "true_fraction": 0.5},
}
assert result == expected_eval_result
@@ -553,8 +553,8 @@ def test_nested_ref_is_inner(client):
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]

@weave.op()
-async def score(target, prediction):
-return target == prediction
+async def score(target, model_output):
+return target == model_output

evaluation = Evaluation(
name="my-eval",
