diff --git a/tests/trace/test_client_trace.py b/tests/trace/test_client_trace.py
index cde47f0f4f5..716424554b8 100644
--- a/tests/trace/test_client_trace.py
+++ b/tests/trace/test_client_trace.py
@@ -1443,7 +1443,7 @@ def test_named_reuse(client):
     dataset = weave.ref(d_ref.uri()).get()
 
     @weave.op()
-    async def dummy_score(model_output):
+    async def dummy_score(output):
         return 1
 
     class SimpleModel(weave.Model):
diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py
index f5ada25215f..a07d9dd5d23 100644
--- a/tests/trace/test_evaluate.py
+++ b/tests/trace/test_evaluate.py
@@ -11,7 +11,7 @@
 
 
 expected_eval_result = {
-    "model_output": {"mean": 9.5},
+    "output": {"mean": 9.5},
     "score": {"true_count": 1, "true_fraction": 0.5},
     "model_latency": {"mean": pytest.approx(0, abs=1)},
 }
@@ -24,8 +24,8 @@ async def predict(self, input) -> str:
 
 
 @weave.op()
-def score(target, model_output):
-    return target == model_output
+def score(target, output):
+    return target == output
 
 
 @weave.op()
@@ -57,7 +57,7 @@ async def model_predict(input, target) -> str:
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     assert result == {
-        "model_output": {"mean": 18.5},
+        "output": {"mean": 18.5},
         "score": {"true_count": 0, "true_fraction": 0.0},
         "model_latency": {
             "mean": pytest.approx(0, abs=1),
@@ -111,8 +111,8 @@ async def infer(self, input) -> str:
 def test_score_as_class(client):
     class MyScorer(weave.Scorer):
         @weave.op()
-        def score(self, target, model_output):
-            return target == model_output
+        def score(self, target, output):
+            return target == output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -121,7 +121,7 @@ def score(self, target, model_output):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "model_output": {"mean": 9.5},
+        "output": {"mean": 9.5},
         "MyScorer": {"true_count": 1, "true_fraction": 0.5},
         "model_latency": {
             "mean": pytest.approx(0, abs=1),
@@ -137,8 +137,8 @@ def summarize(self, score_rows):
             return {"awesome": 3}
 
         @weave.op()
-        def score(self, target, model_output):
-            return target == model_output
+        def score(self, target, output):
+            return target == output
 
     evaluation = Evaluation(
         dataset=dataset_rows,
@@ -147,7 +147,7 @@ def score(self, target, model_output):
     model = EvalModel()
     result = asyncio.run(evaluation.evaluate(model))
     assert result == {
-        "model_output": {"mean": 9.5},
+        "output": {"mean": 9.5},
         "MyScorer": {"awesome": 3},
         "model_latency": {
             "mean": pytest.approx(0, abs=1),
@@ -167,7 +167,7 @@ def return_pred(pred):
 
     result = asyncio.run(evaluation.evaluate(return_pred))
     assert result == {
-        "model_output": {
+        "output": {
             "a": {"true_count": 1, "true_fraction": 1.0},
             "b": {"true_count": 0, "true_fraction": 0.0},
         },
diff --git a/tests/trace/test_evaluation_performance.py b/tests/trace/test_evaluation_performance.py
index bcc36c2ebca..17f01192f26 100644
--- a/tests/trace/test_evaluation_performance.py
+++ b/tests/trace/test_evaluation_performance.py
@@ -91,8 +91,8 @@ def predict(question: str):
         return "I don't know"
 
     @weave.op()
-    def score(question: str, expected: str, model_output: str):
-        return model_output == expected
+    def score(question: str, expected: str, output: str):
+        return output == expected
 
     evaluation = weave.Evaluation(
         name="My Evaluation",
diff --git a/tests/trace/test_weave_client.py b/tests/trace/test_weave_client.py
index a5eb8a49bc3..a746a871a9b 100644
--- a/tests/trace/test_weave_client.py
+++ b/tests/trace/test_weave_client.py
@@ -393,8 +393,8 @@ async def model_predict(input) -> str:
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, model_output):
-        return target == model_output
+    async def score(target, output):
+        return target == output
 
     evaluation = Evaluation(
         name="my-eval",
@@ -747,8 +747,8 @@ async def model_predict(input) -> str:
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
     @weave.op()
-    async def score(target, model_output):
-        return target == model_output
+    async def score(target, output):
+        return target == output
 
     evaluation = Evaluation(
         name="my-eval",
@@ -757,7 +757,7 @@ async def score(target, model_output):
     )
     result = asyncio.run(evaluation.evaluate(model_predict))
     expected_eval_result = {
-        "model_output": {"mean": 9.5},
+        "output": {"mean": 9.5},
         "score": {"true_count": 1, "true_fraction": 0.5},
     }
     assert result == expected_eval_result
@@ -857,8 +857,8 @@ def test_nested_ref_is_inner(client):
     dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
 
    @weave.op()
-    async def score(target, model_output):
-        return target == model_output
+    async def score(target, output):
+        return target == output
 
     evaluation = Evaluation(
         name="my-eval",