diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md
index 86ff37e90b4..09037fa23e2 100644
--- a/docs/docs/tutorial-eval.md
+++ b/docs/docs/tutorial-eval.md
@@ -55,7 +55,7 @@ You can instantiate `Model` objects as normal like this:
 model = ExtractFruitsModel(model_name='gpt-3.5-turbo-1106',
                            prompt_template='Extract fields ("fruit": , "color": , "flavor": ) from the following text, as json: {sentence}')
 sentence = "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy."
-print(asyncio.run(model.predict(sentence))) 
+print(asyncio.run(model.predict(sentence)))
 # if you're in a Jupyter Notebook, run:
 # await model.predict(sentence)
 ```
@@ -64,12 +64,11 @@ print(asyncio.run(model.predict(sentence)))
 Checkout the [Models](/guides/core-types/models) guide to learn more.
 :::
 
-
 ### Collect some examples
 
 ```python
-sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", 
-"Pounits are a bright green color and are more savory than sweet.", 
+sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
+"Pounits are a bright green color and are more savory than sweet.",
 "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
 labels = [
     {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -87,13 +86,13 @@ examples = [
 
 `Evaluation`s assess a `Model`s performance on a set of examples using a list of specified scoring functions.
 
-Here, we'll use a default scoring function `MulticlassF1Score` and we'll also define our own `fruit_name_score`.
+Here, we'll use a default scoring function `MultiTaskBinaryClassificationF1` and we'll also define our own `fruit_name_score`.
 
-Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions. 
+Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions.
 
 ```python
 import weave
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 
 @weave.op()
 def fruit_name_score(target: dict, prediction: dict) -> dict:
@@ -102,7 +101,7 @@ def fruit_name_score(target: dict, prediction: dict) -> dict:
 # highlight-next-line
 evaluation = weave.Evaluation(
     # highlight-next-line
-    dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+    dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
     # highlight-next-line
 )
 # highlight-next-line
@@ -119,10 +118,10 @@ import asyncio
 # highlight-next-line
 import weave
 # highlight-next-line
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 import openai
 
-# We create a model class with one predict function. 
+# We create a model class with one predict function.
 # All inputs, predictions and parameters are automatically captured for easy inspection.
 # highlight-next-line
 class ExtractFruitsModel(weave.Model):
@@ -154,10 +153,10 @@ weave.init('intro-example')
 
 # We create our model with our system prompt.
 model = ExtractFruitsModel(name='gpt4',
-                           model_name='gpt-4-0125-preview', 
+                           model_name='gpt-4-0125-preview',
                            prompt_template='Extract fields ("fruit": , "color": , "flavor") from the following text, as json: {sentence}')
-sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", 
-"Pounits are a bright green color and are more savory than sweet.", 
+sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.",
+"Pounits are a bright green color and are more savory than sweet.",
 "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."]
 labels = [
     {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'},
@@ -177,13 +176,13 @@ examples = [
 def fruit_name_score(target: dict, prediction: dict) -> dict:
     return {'correct': target['fruit'] == prediction['fruit']}
-# Finally, we run an evaluation of this model. 
+# Finally, we run an evaluation of this model.
 # This will generate a prediction for each input example, and then score it with each scoring function.
 # highlight-next-line
 evaluation = weave.Evaluation(
     name='fruit_eval',
     # highlight-next-line
-    dataset=examples, scorers=[MulticlassF1Score(class_names=["fruit", "color", "flavor"]), fruit_name_score],
+    dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
     # highlight-next-line
 )
 # highlight-next-line
 print(asyncio.run(evaluation.evaluate(model)))
diff --git a/examples/text-extract/evaluate.py b/examples/text-extract/evaluate.py
index 8cd0b423f65..5f846d32f37 100644
--- a/examples/text-extract/evaluate.py
+++ b/examples/text-extract/evaluate.py
@@ -7,7 +7,7 @@
 import openai
 
 import weave
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 
 
 class TextExtractModel(weave.Model):
@@ -42,7 +42,7 @@ def main():
 
     eval = weave.Evaluation(
         dataset=dataset_rows,
-        scorers=[MulticlassF1Score(class_names=["name", "shares"])],
+        scorers=[MultiTaskBinaryClassificationF1(class_names=["name", "shares"])],
     )
 
     model = TextExtractModel(
diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index c9b4f533158..25f2fe1e02a 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -105,7 +105,7 @@ async def predict_and_score(
                 f"{model_predict} expects arguments: {model_predict_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
             )
         try:
-            prediction = await async_call(model_predict, **model_predict_args)
+            model_output = await async_call(model_predict, **model_predict_args)
         except OpCallError as e:
             dataset_column_names = list(example.keys())
             dataset_column_names_str = ", ".join(dataset_column_names[:3])
@@ -131,9 +131,9 @@
             )
             raise OpCallError(message)
         except Exception as e:
-            print("Prediction failed")
+            print("model_output failed")
             traceback.print_exc()
-            prediction = None
+            model_output = None
 
         scores = {}
         scorers = typing.cast(list[Union[Op, Scorer]], self.scorers or [])
@@ -151,9 +151,9 @@
             if isinstance(score_arg_names, BoundOp):
                 score_arg_names = score_arg_names[1:]
 
-            if "prediction" not in score_arg_names:
+            if "model_output" not in score_arg_names:
                 raise OpCallError(
-                    f"Scorer {scorer_name} must have a 'prediction' argument, to receive the output of the model function."
+                    f"Scorer {scorer_name} must have a 'model_output' argument, to receive the output of the model function."
                 )
 
             if isinstance(example, dict):
@@ -165,7 +165,7 @@
                 raise ValueError(
                     f"{score_fn} expects arguments: {score_arg_names}, provide a preprocess_model_input function that returns a dict with those keys."
                 )
-            score_args["prediction"] = prediction
+            score_args["model_output"] = model_output
 
             try:
                 result = await async_call(score_fn, **score_args)
@@ -181,7 +181,7 @@
                 ]
                 if isinstance(score_fn, BoundOp):
                     required_arg_names = required_arg_names[1:]
-                required_arg_names.remove("prediction")
+                required_arg_names.remove("model_output")
 
                 message = textwrap.dedent(
                     f"""
@@ -196,7 +196,7 @@
             scores[scorer_name] = result
 
         return {
-            "prediction": prediction,
+            "model_output": model_output,
             "scores": scores,
         }
 
@@ -205,9 +205,9 @@ async def summarize(self, eval_table: typing.Union[weave.WeaveList, list]) -> di
         summary = {}
         if not isinstance(eval_table, weave.WeaveList):
             eval_table = weave.WeaveList(eval_table)
-        prediction_summary = auto_summarize(eval_table.column("prediction"))
-        if prediction_summary:
-            summary["prediction"] = prediction_summary
+        model_output_summary = auto_summarize(eval_table.column("model_output"))
+        if model_output_summary:
+            summary["model_output"] = model_output_summary
         scorers = self.scorers or []
         for scorer in scorers:
             scorer_name, _, summarize_fn = get_scorer_attributes(scorer)
@@ -229,7 +229,7 @@ async def eval_example(example: dict) -> dict:
             except Exception as e:
                 print("Predict and score failed")
                 traceback.print_exc()
-                return {"prediction": None, "scores": {}}
+                return {"model_output": None, "scores": {}}
             return eval_row
 
         n_complete = 0
@@ -245,7 +245,7 @@ async def eval_example(example: dict) -> dict:
            # f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]"  # type:ignore
            # )
            if eval_row == None:
-                eval_row = {"prediction": None, "scores": {}}
+                eval_row = {"model_output": None, "scores": {}}
            if eval_row["scores"] == None:
                eval_row["scores"] = {}
            for scorer in self.scorers or []:
diff --git a/weave/flow/scorer.py b/weave/flow/scorer.py
index da77d56bb3d..9c0471d3524 100644
--- a/weave/flow/scorer.py
+++ b/weave/flow/scorer.py
@@ -8,7 +8,7 @@
 
 
 class Scorer(Object):
-    def score(self, target: Any, prediction: Any) -> Any:
+    def score(self, target: Any, model_output: Any) -> Any:
         raise NotImplementedError
 
     @weave.op()
@@ -105,7 +105,7 @@ def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
     return precision, recall, f1
 
 
-class MulticlassF1Score(Scorer):
+class MultiTaskBinaryClassificationF1(Scorer):
     class_names: list[str]
 
     @weave.op()
@@ -130,13 +130,13 @@ def summarize(self, score_rows: WeaveList) -> Optional[dict]:
         return result
 
     @weave.op()
-    def score(self, target: dict, prediction: Optional[dict]) -> dict:
+    def score(self, target: dict, model_output: Optional[dict]) -> dict:
         result = {}
         for class_name in self.class_names:
             class_label = target.get(class_name)
-            class_prediction = prediction.get(class_name) if prediction else None
+            class_model_output = model_output.get(class_name) if model_output else None
             result[class_name] = {
-                "correct": class_label == class_prediction,
-                "negative": not class_prediction,
+                "correct": class_label == class_model_output,
+                "negative": not class_model_output,
             }
         return result
diff --git a/weave/tests/test_client_trace.py b/weave/tests/test_client_trace.py
index 2b63fbc2a36..e5f40d4d1de 100644
--- a/weave/tests/test_client_trace.py
+++ b/weave/tests/test_client_trace.py
@@ -927,7 +927,7 @@ def test_named_reuse(client):
     dataset = weave.ref(d_ref.uri()).get()
 
     @weave.op()
-    async def dummy_score(prediction):
+    async def dummy_score(model_output):
         return 1
 
     class SimpleModel(weave.Model):
diff --git a/weave/tests/test_evaluate.py b/weave/tests/test_evaluate.py
index a0d1a9dee08..4a4c55ccb2d 100644
--- a/weave/tests/test_evaluate.py
+++ b/weave/tests/test_evaluate.py
@@ -2,7 +2,7 @@
 import pytest
 import weave
 from weave import ref_base
-from weave.flow.scorer import MulticlassF1Score
+from weave.flow.scorer import MultiTaskBinaryClassificationF1
 from weave import Dataset, Model, Evaluation
 
 pytestmark = pytest.mark.webtest
@@ -12,7 +12,7 @@
 dataset = Dataset(rows=dataset_rows)
 
 expected_eval_result = {
-    "prediction": {"mean": 9.5},
+    "model_output": {"mean": 9.5},
     "score": {"true_count": 1, "true_fraction": 0.5},
 }
 
@@ -24,8 +24,8 @@ async def predict(self, input) -> str:
 
 
 @weave.op()
-def score(target, prediction):
-    return target == prediction
+def score(target, model_output):
+    return target == model_output
 
 
 @weave.op()
@@ -57,7 +57,7 @@ async def model_predict(input, target) -> str:
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     assert result == {
-        "prediction": {"mean": 18.5},
+        "model_output": {"mean": 18.5},
         "score": {"true_count": 0, "true_fraction": 0.0},
     }
 
@@ -108,8 +108,8 @@ async def infer(self, input) -> str:
 def test_score_as_class(client):
     class MyScorer(weave.Scorer):
         @weave.op()
-        def score(self, target, prediction):
-            return target == prediction
+        def score(self, target, model_output):
+            return target == model_output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -118,7 +118,7 @@ def score(self, target, prediction):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "MyScorer": {"true_count": 1, "true_fraction": 0.5},
     }
 
@@ -131,8 +131,8 @@ def summarize(self, score_rows):
             return {"awesome": 3}
 
         @weave.op()
-        def score(self, target, prediction):
-            return target == prediction
+        def score(self, target, model_output):
+            return target == model_output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -141,7 +141,7 @@ def score(self, target, prediction):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "MyScorer": {"awesome": 3},
     }
 
@@ -149,7 +149,7 @@ def score(self, target, prediction):
 def test_multiclass_f1_score(client):
     evaluation = Evaluation(
         dataset=[{"target": {"a": False, "b": True}, "pred": {"a": True, "b": False}}],
-        scorers=[MulticlassF1Score(class_names=["a", "b"])],
+        scorers=[MultiTaskBinaryClassificationF1(class_names=["a", "b"])],
     )
 
     @weave.op()
@@ -158,11 +158,11 @@ def return_pred(pred):
         return pred
     result = asyncio.run(evaluation.evaluate(return_pred))
     assert result == {
-        "prediction": {
+        "model_output": {
             "a": {"true_count": 1, "true_fraction": 1.0},
             "b": {"true_count": 0, "true_fraction": 0.0},
         },
-        "MulticlassF1Score": {
+        "MultiTaskBinaryClassificationF1": {
             "a": {"f1": 0, "precision": 0.0, "recall": 0},
             "b": {"f1": 0, "precision": 0, "recall": 0.0},
         },
diff --git a/weave/tests/test_weave_client.py b/weave/tests/test_weave_client.py
index def19f0ac01..fac1a99c70c 100644
--- a/weave/tests/test_weave_client.py
+++ b/weave/tests/test_weave_client.py
@@ -443,8 +443,8 @@ async def model_predict(input) -> str:
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, prediction):
-        return target == prediction
+    async def score(target, model_output):
+        return target == model_output
 
     evaluation = Evaluation(
         name="my-eval",
@@ -453,7 +453,7 @@
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     expected_eval_result = {
-        "prediction": {"mean": 9.5},
+        "model_output": {"mean": 9.5},
         "score": {"true_count": 1, "true_fraction": 0.5},
     }
     assert result == expected_eval_result
@@ -553,8 +553,8 @@ def test_nested_ref_is_inner(client):
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, prediction):
-        return target == prediction
+    async def score(target, model_output):
+        return target == model_output
 
     evaluation = Evaluation(
         name="my-eval",