From 067d6c51c519660b6f4e01fe35d8f43fe544a287 Mon Sep 17 00:00:00 2001 From: Tim Sweeney Date: Thu, 31 Oct 2024 16:09:18 -0600 Subject: [PATCH 1/2] Init --- tests/trace/test_leaderboard.py | 191 ++++++++++++++++++ weave/flow/leaderboard.py | 87 ++++++++ weave/trace/api.py | 7 + weave/trace/urls.py | 4 + .../base_object_registry.py | 7 +- .../generated_base_object_class_schemas.json | 81 +++++++- .../base_object_classes/leaderboard.py | 16 ++ 7 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 tests/trace/test_leaderboard.py create mode 100644 weave/flow/leaderboard.py create mode 100644 weave/trace_server/interface/base_object_classes/leaderboard.py diff --git a/tests/trace/test_leaderboard.py b/tests/trace/test_leaderboard.py new file mode 100644 index 00000000000..a25fccbdd35 --- /dev/null +++ b/tests/trace/test_leaderboard.py @@ -0,0 +1,191 @@ +import pytest + +import weave +from weave.flow import leaderboard +from weave.trace.weave_client import get_ref + + +def test_leaderboard_empty(client): + evaluation_obj_1 = weave.Evaluation( + name="test_evaluation_name", + dataset=[{"input": -1, "target": -1}], + scorers=[], + ) + + weave.publish(evaluation_obj_1) + + spec = leaderboard.Leaderboard( + name="Empty Leaderboard", + description="""This is an empty leaderboard""", + columns=[ + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluation_obj_1).uri(), + scorer_name="test_scorer_name", + summary_metric_path="test_summary_metric_path", + ) + ], + ) + + ref = weave.publish(spec) + + # Overriding spec to show that this works + spec = ref.get() + + results = leaderboard.get_leaderboard_results(spec, client) + assert len(results) == 0 + + +def test_leaderboard_mis_configured(client): + spec = leaderboard.Leaderboard( + name="Misconfigured Leaderboard", + description="""This is a misconfigured leaderboard""", + columns=[ + leaderboard.LeaderboardColumn( + evaluation_object_ref="test_evaluation_object_ref", + scorer_name="test_scorer_name", + summary_metric_path="test_summary_metric_path", + ) + ], + ) + + ref = weave.publish(spec) + + # Overriding spec to show that this works + spec = ref.get() + + results = leaderboard.get_leaderboard_results(spec, client) + assert len(results) == 0 + + +async def do_evaluations(): + @weave.op + def my_scorer(target, output): + return target == output + + evaluation_obj_1 = weave.Evaluation( + name="test_evaluation_name", + dataset=[{"input": 1, "target": 1}], + scorers=[my_scorer], + ) + + @weave.op + def simple_model(input): + return input + + await evaluation_obj_1.evaluate(simple_model) + + evaluation_obj_2 = weave.Evaluation( + name="test_evaluation_name", + dataset=[{"input": 1, "target": 1}, {"input": 2, "target": 2}], + scorers=[my_scorer], + ) + + @weave.op + def static_model(input): + return 1 + + @weave.op + def bad_model(input): + return input + 1 + + await evaluation_obj_2.evaluate(simple_model) + await evaluation_obj_2.evaluate(static_model) + await evaluation_obj_2.evaluate(bad_model) + + return evaluation_obj_1, evaluation_obj_2, simple_model, static_model, bad_model + + +@pytest.mark.asyncio +async def test_leaderboard_with_results(client): + ( + evaluation_obj_1, + evaluation_obj_2, + simple_model, + static_model, + bad_model, + ) = await do_evaluations() + + spec = leaderboard.Leaderboard( + name="Simple Leaderboard", + description="""This is a simple leaderboard""", + columns=[ + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluation_obj_1).uri(), + scorer_name="my_scorer", + 
summary_metric_path="true_fraction", + ) + ], + ) + + ref = weave.publish(spec) + + # Overriding spec to show that this works + spec = ref.get() + + results = leaderboard.get_leaderboard_results(spec, client) + assert len(results) == 1 + assert results[0].model_ref == get_ref(simple_model).uri() + assert results[0].column_scores[0].scores[0].value == 1.0 + + spec = leaderboard.Leaderboard( + name="Complex Leaderboard", + description=""" +This leaderboard has multiple columns + +### Columns + +1. Column 1: + - Evaluation Object: test_evaluation_object_ref + - Scorer Name: test_scorer_name + - Summary Metric Path: test_summary_metric_path +2. Column 2: + - Evaluation Object: test_evaluation_object_ref + - Scorer Name: test_scorer_name + - Summary Metric Path: test_summary_metric_path +3. Column 3: + - Evaluation Object: test_evaluation_object_ref + - Scorer Name: test_scorer_name + - Summary Metric Path: test_summary_metric_path +""", + columns=[ + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluation_obj_2).uri(), + scorer_name="my_scorer", + summary_metric_path="true_count", + ), + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluation_obj_2).uri(), + scorer_name="my_scorer", + should_minimize=True, + summary_metric_path="true_fraction", + ), + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluation_obj_1).uri(), + scorer_name="my_scorer", + summary_metric_path="true_fraction", + ), + ], + ) + + ref = weave.publish(spec) + + # Overriding spec to show that this works + spec = ref.get() + + results = leaderboard.get_leaderboard_results(spec, client) + assert len(results) == 3 + assert results[0].model_ref == get_ref(simple_model).uri() + assert len(results[0].column_scores) == 3 + assert results[0].column_scores[0].scores[0].value == 2.0 + assert results[0].column_scores[1].scores[0].value == 1.0 + assert results[0].column_scores[1].scores[0].value == 1.0 + assert results[1].model_ref == get_ref(static_model).uri() + assert len(results[1].column_scores) == 3 + assert results[1].column_scores[0].scores[0].value == 1.0 + assert results[1].column_scores[1].scores[0].value == 0.5 + assert len(results[1].column_scores[2].scores) == 0 + assert results[2].model_ref == get_ref(bad_model).uri() + assert len(results[1].column_scores) == 3 + assert results[2].column_scores[0].scores[0].value == 0 + assert results[2].column_scores[1].scores[0].value == 0 + assert len(results[2].column_scores[2].scores) == 0 diff --git a/weave/flow/leaderboard.py b/weave/flow/leaderboard.py new file mode 100644 index 00000000000..14fce2cf5e9 --- /dev/null +++ b/weave/flow/leaderboard.py @@ -0,0 +1,87 @@ +from typing import Any + +from pydantic import BaseModel + +from weave.trace.refs import OpRef +from weave.trace.weave_client import WeaveClient, get_ref +from weave.trace_server.interface.base_object_classes import leaderboard +from weave.trace_server.trace_server_interface import CallsFilter + + +class LeaderboardModelEvaluationResult(BaseModel): + evaluate_call_ref: str + value: Any + + +class ModelScoresForColumn(BaseModel): + scores: list[LeaderboardModelEvaluationResult] + + +class LeaderboardModelResult(BaseModel): + model_ref: str + column_scores: list[ModelScoresForColumn] + + +def get_leaderboard_results( + spec: leaderboard.Leaderboard, client: WeaveClient +) -> list[LeaderboardModelResult]: + entity, project = client._project_id().split("/") + calls = client.get_calls( + filter=CallsFilter( + op_names=[ + OpRef( + entity=entity, + project=project, + 
name="Evaluation.evaluate", + _digest="*", + ).uri() + ], + input_refs=[c.evaluation_object_ref for c in spec.columns], + ) + ) + + res_map: dict[str, LeaderboardModelResult] = {} + for call in calls: + # Frustrating that we have to get the ref like this. Since the + # `Call` object auto-derefs the inputs (making a network request), + # we have to manually get the ref here... waste of network calls. + call_ref = get_ref(call) + if call_ref is None: + continue + call_ref_uri = call_ref.uri() + + model_ref = get_ref(call.inputs["model"]) + if model_ref is None: + continue + model_ref_uri = model_ref.uri() + if model_ref_uri not in res_map: + res_map[model_ref_uri] = LeaderboardModelResult( + model_ref=model_ref_uri, + column_scores=[ModelScoresForColumn(scores=[]) for _ in spec.columns], + ) + for col_idx, c in enumerate(spec.columns): + eval_obj_ref = get_ref(call.inputs["self"]) + if eval_obj_ref is None: + continue + eval_obj_ref_uri = eval_obj_ref.uri() + if c.evaluation_object_ref != eval_obj_ref_uri: + continue + val = call.output.get(c.scorer_name) + for part in c.summary_metric_path.split("."): + if isinstance(val, dict): + val = val.get(part) + elif isinstance(val, list): + val = val[int(part)] + else: + break + res_map[model_ref_uri].column_scores[col_idx].scores.append( + LeaderboardModelEvaluationResult( + evaluate_call_ref=call_ref_uri, value=val + ) + ) + return list(res_map.values()) + + +# Re-export: +Leaderboard = leaderboard.Leaderboard +LeaderboardColumn = leaderboard.LeaderboardColumn diff --git a/weave/trace/api.py b/weave/trace/api.py index fc4847bc0cb..38881c34e81 100644 --- a/weave/trace/api.py +++ b/weave/trace/api.py @@ -22,6 +22,7 @@ should_disable_weave, ) from weave.trace.table import Table +from weave.trace_server.interface.base_object_classes import leaderboard def init( @@ -109,6 +110,12 @@ def publish(obj: Any, name: Optional[str] = None) -> weave_client.ObjectRef: ref.name, ref.digest, ) + elif isinstance(obj, leaderboard.Leaderboard): + url = urls.leaderboard_path( + ref.entity, + ref.project, + ref.name, + ) # TODO(gst): once frontend has direct dataset/model links # elif isinstance(obj, weave_client.Dataset): else: diff --git a/weave/trace/urls.py b/weave/trace/urls.py index d5237c668b7..60a9b3a2a85 100644 --- a/weave/trace/urls.py +++ b/weave/trace/urls.py @@ -34,5 +34,9 @@ def object_version_path( return f"{project_weave_root_url(entity_name, project_name)}/objects/{quote(object_name)}/versions/{obj_version}" +def leaderboard_path(entity_name: str, project_name: str, object_name: str) -> str: + return f"{project_weave_root_url(entity_name, project_name)}/leaderboards/{quote(object_name)}" + + def redirect_call(entity_name: str, project_name: str, call_id: str) -> str: return f"{remote_project_root_url(entity_name, project_name)}/r/call/{call_id}" diff --git a/weave/trace_server/interface/base_object_classes/base_object_registry.py b/weave/trace_server/interface/base_object_classes/base_object_registry.py index 8941dfa75ad..843598d5979 100644 --- a/weave/trace_server/interface/base_object_classes/base_object_registry.py +++ b/weave/trace_server/interface/base_object_classes/base_object_registry.py @@ -1,7 +1,11 @@ from typing import Dict, Type from weave.trace_server.interface.base_object_classes.base_object_def import BaseObject -from weave.trace_server.interface.base_object_classes.test_only_example import * +from weave.trace_server.interface.base_object_classes.leaderboard import Leaderboard +from 
weave.trace_server.interface.base_object_classes.test_only_example import ( + TestOnlyExample, + TestOnlyNestedBaseObject, +) BASE_OBJECT_REGISTRY: Dict[str, Type[BaseObject]] = {} @@ -18,3 +22,4 @@ def register_base_object(cls: Type[BaseObject]) -> None: register_base_object(TestOnlyExample) register_base_object(TestOnlyNestedBaseObject) +register_base_object(Leaderboard) diff --git a/weave/trace_server/interface/base_object_classes/generated/generated_base_object_class_schemas.json b/weave/trace_server/interface/base_object_classes/generated/generated_base_object_class_schemas.json index 4f7aee9dd54..2207d2c1f3c 100644 --- a/weave/trace_server/interface/base_object_classes/generated/generated_base_object_class_schemas.json +++ b/weave/trace_server/interface/base_object_classes/generated/generated_base_object_class_schemas.json @@ -1,5 +1,80 @@ { "$defs": { + "Leaderboard": { + "properties": { + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Name" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Description" + }, + "columns": { + "items": { + "$ref": "#/$defs/LeaderboardColumn" + }, + "title": "Columns", + "type": "array" + } + }, + "required": [ + "columns" + ], + "title": "Leaderboard", + "type": "object" + }, + "LeaderboardColumn": { + "properties": { + "evaluation_object_ref": { + "title": "Evaluation Object Ref", + "type": "string" + }, + "scorer_name": { + "title": "Scorer Name", + "type": "string" + }, + "summary_metric_path": { + "title": "Summary Metric Path", + "type": "string" + }, + "should_minimize": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Should Minimize" + } + }, + "required": [ + "evaluation_object_ref", + "scorer_name", + "summary_metric_path" + ], + "title": "LeaderboardColumn", + "type": "object" + }, "TestOnlyExample": { "properties": { "name": { @@ -103,11 +178,15 @@ }, "TestOnlyNestedBaseObject": { "$ref": "#/$defs/TestOnlyNestedBaseObject" + }, + "Leaderboard": { + "$ref": "#/$defs/Leaderboard" } }, "required": [ "TestOnlyExample", - "TestOnlyNestedBaseObject" + "TestOnlyNestedBaseObject", + "Leaderboard" ], "title": "CompositeBaseObject", "type": "object" diff --git a/weave/trace_server/interface/base_object_classes/leaderboard.py b/weave/trace_server/interface/base_object_classes/leaderboard.py new file mode 100644 index 00000000000..c55be78e80f --- /dev/null +++ b/weave/trace_server/interface/base_object_classes/leaderboard.py @@ -0,0 +1,16 @@ +from typing import Optional + +from pydantic import BaseModel + +from weave.trace_server.interface.base_object_classes import base_object_def + + +class LeaderboardColumn(BaseModel): + evaluation_object_ref: base_object_def.RefStr + scorer_name: str + summary_metric_path: str + should_minimize: Optional[bool] = None + + +class Leaderboard(base_object_def.BaseObject): + columns: list[LeaderboardColumn] From 840f5de16ea7b1e7ef67537d2ffa6df805bc05d6 Mon Sep 17 00:00:00 2001 From: Tim Sweeney Date: Thu, 31 Oct 2024 16:13:13 -0600 Subject: [PATCH 2/2] gen --- .../generatedBaseObjectClasses.zod.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/wfReactInterface/generatedBaseObjectClasses.zod.ts b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/wfReactInterface/generatedBaseObjectClasses.zod.ts index fd27f0bc933..1acf71ad314 
100644 --- a/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/wfReactInterface/generatedBaseObjectClasses.zod.ts +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/wfReactInterface/generatedBaseObjectClasses.zod.ts @@ -1,5 +1,13 @@ import * as z from 'zod'; +export const LeaderboardColumnSchema = z.object({ + evaluation_object_ref: z.string(), + scorer_name: z.string(), + should_minimize: z.union([z.boolean(), z.null()]).optional(), + summary_metric_path: z.string(), +}); +export type LeaderboardColumn = z.infer; + export const TestOnlyNestedBaseModelSchema = z.object({ a: z.number(), }); @@ -16,6 +24,13 @@ export type TestOnlyNestedBaseObject = z.infer< typeof TestOnlyNestedBaseObjectSchema >; +export const LeaderboardSchema = z.object({ + columns: z.array(LeaderboardColumnSchema), + description: z.union([z.null(), z.string()]).optional(), + name: z.union([z.null(), z.string()]).optional(), +}); +export type Leaderboard = z.infer; + export const TestOnlyExampleSchema = z.object({ description: z.union([z.null(), z.string()]).optional(), name: z.union([z.null(), z.string()]).optional(), @@ -26,6 +41,7 @@ export const TestOnlyExampleSchema = z.object({ export type TestOnlyExample = z.infer; export const baseObjectClassRegistry = { + Leaderboard: LeaderboardSchema, TestOnlyExample: TestOnlyExampleSchema, TestOnlyNestedBaseObject: TestOnlyNestedBaseObjectSchema, };
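For reviewers, a minimal usage sketch of the API this patch introduces, assembled from test_leaderboard_with_results above. It is a sketch under assumptions, not part of the patch: the project name, scorer, and model below are hypothetical; only Leaderboard, LeaderboardColumn, get_leaderboard_results, get_ref, and the "true_fraction" summary path come from the diff itself.

import asyncio

import weave
from weave.flow import leaderboard
from weave.trace.weave_client import get_ref


@weave.op
def exact_match(target, output):
    # Boolean scorer; Evaluation summarizes it with true_count / true_fraction.
    return target == output


@weave.op
def my_model(input):
    return input


async def main() -> None:
    # Hypothetical project; weave.init returns the WeaveClient passed below.
    client = weave.init("my-entity/my-project")

    evaluation = weave.Evaluation(
        name="exact_match_eval",
        dataset=[{"input": 1, "target": 1}, {"input": 2, "target": 2}],
        scorers=[exact_match],
    )
    await evaluation.evaluate(my_model)

    spec = leaderboard.Leaderboard(
        name="My Leaderboard",
        description="Fraction of exact matches; higher is better",
        columns=[
            leaderboard.LeaderboardColumn(
                evaluation_object_ref=get_ref(evaluation).uri(),
                scorer_name="exact_match",
                summary_metric_path="true_fraction",
            )
        ],
    )
    # With the api.py change above, publish() should link to the new
    # /leaderboards/<name> page added in urls.py.
    weave.publish(spec)

    # One LeaderboardModelResult per model evaluated against the referenced evaluation.
    for result in leaderboard.get_leaderboard_results(spec, client):
        print(result.model_ref, [s.value for s in result.column_scores[0].scores])


asyncio.run(main())

Each LeaderboardModelResult carries one ModelScoresForColumn per column in the spec, so a model that was never run against a column's evaluation simply gets an empty scores list for that column, which is the behavior the misconfigured-leaderboard and three-column tests above assert on.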