Skip to content

Commit

Permalink
finished basic scorers
Browse files Browse the repository at this point in the history
  • Loading branch information
tssweeney committed Dec 12, 2024
1 parent 9612a9d commit 51a9a4c
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 12 deletions.
119 changes: 110 additions & 9 deletions tests/trace/builtin_objects/backend_models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"🍩 https://wandb.ai/timssweeney/remote_model_demo_4/r/call/0193b4ad-e5f5-7c10-a41f-ee06bc0c34ea\n",
"🍩 https://wandb.ai/timssweeney/remote_model_demo_4/r/call/0193ba4f-de91-79a2-9028-67c3c500703e\n",
"weave:///timssweeney/remote_model_demo_4/object/LiteLLMCompletionModel:KBsfUswVpEHFYmZuJjmhM2YH4EttkRZJSoH0Z0ZaNRY\n",
"{'name': 'Fred', 'age': 30}\n"
]
Expand Down Expand Up @@ -97,7 +97,7 @@
{
"data": {
"text/plain": [
"CallMethodRes(call_id='0193b4af-f064-7f92-8852-ed3024344dac', output={'name': 'Charles', 'age': 40})"
"CallMethodRes(call_id='0193ba4f-fc13-79c2-b217-03e6fdd7e7c4', output={'name': 'Charles', 'age': 40})"
]
},
"execution_count": 5,
Expand All @@ -124,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -133,7 +133,7 @@
"ObjCreateRes(digest='k85wXnWLVxpHujpohAqNBIirXZSM6XRSOSk84n1XR84')"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -183,14 +183,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🍩 https://wandb.ai/timssweeney/remote_model_demo_4/r/call/0193b4b2-897b-77e0-b404-236415a689b9\n"
"🍩 https://wandb.ai/timssweeney/remote_model_demo_4/r/call/0193ba50-1be0-70d2-84cd-dcf8fac3ff09\n"
]
},
{
Expand All @@ -199,7 +199,7 @@
"{'name': 'Fred', 'age': 30}"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -224,7 +224,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -233,7 +233,7 @@
"LiteLLMCompletionModel(name=None, description=None, model='gpt-4o', messages_template=WeaveList([{'role': 'system', 'content': 'Please extract the name and age from the following text!'}, {'role': 'user', 'content': '{user_input}'}]), response_format=WeaveDict({'type': 'json_schema', 'json_schema': {'name': 'Person', 'schema': {'type': 'object', 'properties': {'age': {'type': 'integer'}, 'name': {'type': 'string'}}}}}))"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -242,6 +242,107 @@
"gotten_model"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Part 2: Scoring:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ScoreCallRes(feedback_id='0193ba55-d466-74a3-a4de-da0a456b08a7', score_call=CallSchema(id='0193ba55-cb43-7c61-a712-f7249e6dfe4f', project_id='UHJvamVjdEludGVybmFsSWQ6NDA1NzYyOTQ=', op_name='weave:///timssweeney/remote_model_demo_4/op/LLMJudgeScorer.score:LSxb3VBdL8YmPr9vqYhxsMe74D8C04dJL1IKQ61Ke7M', display_name=None, trace_id='0193ba55-cb43-7c61-a712-f71512a66d3b', parent_id=None, started_at=datetime.datetime(2024, 12, 12, 10, 6, 45, 59887, tzinfo=TzInfo(UTC)), attributes={'weave': {'client_version': '0.51.25-dev0', 'source': 'python-sdk', 'os_name': 'Darwin', 'os_version': 'Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:18 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6000', 'os_release': '23.2.0', 'sys_version': '3.10.8 (main, Dec 5 2022, 18:10:41) [Clang 14.0.0 (clang-1400.0.29.202)]'}}, inputs={'self': 'weave:///timssweeney/remote_model_demo_4/object/LLMJudgeScorer:uCL086uULzE1HKLFn8YIezCG98HiqayaAp3d1R9ktA0', 'inputs': {'kwargs': {'user_input': 'Hello, my name is Charles and I am 40 years old.'}}, 'output': {'name': 'Charles', 'age': 40}}, ended_at=datetime.datetime(2024, 12, 12, 10, 6, 47, 368348, tzinfo=TzInfo(UTC)), exception=None, output={'is_correct': True}, summary={'usage': {'gpt-4o-2024-08-06': {'prompt_tokens': 91, 'completion_tokens': 6, 'requests': 1, 'total_tokens': 97, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}}, 'weave': {'status': <TraceStatus.SUCCESS: 'success'>, 'trace_name': 'LLMJudgeScorer.score', 'latency_ms': 2308}}, wb_user_id='VXNlcjo2Mzg4Nw==', wb_run_id=None, deleted_at=None))"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import weave\n",
"from weave.trace.refs import CallRef\n",
"from weave.trace_server import trace_server_interface as tsi\n",
"\n",
"obj_create_res = client.server.obj_create(\n",
" tsi.ObjCreateReq.model_validate(\n",
" {\n",
" \"obj\": {\n",
" \"project_id\": client._project_id(),\n",
" \"object_id\": \"CorrectnessJudge\",\n",
" \"val\": {\n",
" \"model\": \"gpt-4o\",\n",
" \"system_prompt\": \"You are a judge that scores the correctness of a response.\",\n",
" \"response_format\": {\n",
" \"type\": \"json_schema\",\n",
" \"json_schema\": {\n",
" \"name\": \"Correctness\",\n",
" \"schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"is_correct\": {\"type\": \"boolean\"},\n",
" },\n",
" },\n",
" },\n",
" },\n",
" },\n",
" \"set_leaf_object_class\": \"LLMJudgeScorer\",\n",
" }\n",
" }\n",
" )\n",
")\n",
"client._flush()\n",
"scorer_ref = weave.ObjectRef(\n",
" entity=client._project_id().split(\"/\")[0],\n",
" project=client._project_id().split(\"/\")[1],\n",
" name=\"CorrectnessJudge\",\n",
" _digest=obj_create_res.digest,\n",
")\n",
"\n",
"call_ref = CallRef(\n",
" entity=client._project_id().split(\"/\")[0],\n",
" project=client._project_id().split(\"/\")[1],\n",
" id=call_res.call_id,\n",
")\n",
"\n",
"score_res = client.server.score_call(\n",
" tsi.ScoreCallReq.model_validate(\n",
" {\n",
" \"project_id\": client._project_id(),\n",
" \"call_ref\": call_ref.uri(),\n",
" \"scorer_ref\": scorer_ref.uri(),\n",
" }\n",
" )\n",
")\n",
"\n",
"score_res"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"feedback_id='0193ba50-631e-7192-aa0c-d9332c474c17' score_call=CallSchema(id='0193ba50-62e7-7f21-bc8c-e8f46fedbdb4', project_id='UHJvamVjdEludGVybmFsSWQ6NDA1NzYyOTQ=', op_name='weave:///timssweeney/remote_model_demo_4/op/LLMJudgeScorer.score:LSxb3VBdL8YmPr9vqYhxsMe74D8C04dJL1IKQ61Ke7M', display_name=None, trace_id='0193ba50-62e7-7f21-bc8c-e8eb057791c0', parent_id=None, started_at=datetime.datetime(2024, 12, 12, 10, 0, 50, 663992, tzinfo=TzInfo(UTC)), attributes={'weave': {'client_version': '0.51.25-dev0', 'source': 'python-sdk', 'os_name': 'Darwin', 'os_version': 'Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:18 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6000', 'os_release': '23.2.0', 'sys_version': '3.10.8 (main, Dec 5 2022, 18:10:41) [Clang 14.0.0 (clang-1400.0.29.202)]'}}, inputs={'self': 'weave:///timssweeney/remote_model_demo_4/object/LLMJudgeScorer:uCL086uULzE1HKLFn8YIezCG98HiqayaAp3d1R9ktA0', 'inputs': {'self': 'weave:///timssweeney/remote_model_demo_4/object/LiteLLMCompletionModel:KBsfUswVpEHFYmZuJjmhM2YH4EttkRZJSoH0Z0ZaNRY', 'kwargs': {'user_input': 'Hello, my name is Charles and I am 40 years old.'}}, 'output': {'name': 'Charles', 'age': 40}}, ended_at=datetime.datetime(2024, 12, 12, 10, 0, 50, 689418, tzinfo=TzInfo(UTC)), exception='{\"type\": \"TypeError\", \"message\": \"Object of type ObjectRef is not JSON serializable\", \"traceback\": [{\"filename\": \"<string>\", \"line_number\": 1, \"function_name\": \"<module>\", \"text\": \"\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/multiprocessing/spawn.py\", \"line_number\": 116, \"function_name\": \"spawn_main\", \"text\": \"exitcode = _main(fd, parent_sentinel)\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/multiprocessing/spawn.py\", \"line_number\": 129, \"function_name\": \"_main\", \"text\": \"return self._bootstrap(parent_sentinel)\"}, {\"filename\": 
\"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/multiprocessing/process.py\", \"line_number\": 314, \"function_name\": \"_bootstrap\", \"text\": \"self.run()\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/multiprocessing/process.py\", \"line_number\": 108, \"function_name\": \"run\", \"text\": \"self._target(*self._args, **self._kwargs)\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/trace_server/server_side_object_saver.py\", \"line_number\": 301, \"function_name\": \"_score_call\", \"text\": \"apply_scorer_res = target_call._apply_scorer(scorer)\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/trace/weave_client.py\", \"line_number\": 497, \"function_name\": \"_apply_scorer\", \"text\": \"_, score_call = scorer_op.call(**score_args)\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/trace/op.py\", \"line_number\": 372, \"function_name\": \"call\", \"text\": \"return _do_call(\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/trace/op.py\", \"line_number\": 432, \"function_name\": \"_do_call\", \"text\": \"execute_result = _execute_op(\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/trace/op.py\", \"line_number\": 331, \"function_name\": \"_execute_op\", \"text\": \"res = func(*args, **kwargs)\"}, {\"filename\": \"/Users/timothysweeney/Workspace/github/wandb/core/services/weave-python/weave-public/weave/builtin_objects/scorers/LLMJudgeScorer.py\", \"line_number\": 18, \"function_name\": \"score\", \"text\": \"user_prompt = json.dumps(\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/json/__init__.py\", \"line_number\": 231, \"function_name\": \"dumps\", \"text\": \"return 
_default_encoder.encode(obj)\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/json/encoder.py\", \"line_number\": 199, \"function_name\": \"encode\", \"text\": \"chunks = self.iterencode(o, _one_shot=True)\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/json/encoder.py\", \"line_number\": 257, \"function_name\": \"iterencode\", \"text\": \"return _iterencode(o, 0)\"}, {\"filename\": \"/Users/timothysweeney/.pyenv/versions/3.10.8/lib/python3.10/json/encoder.py\", \"line_number\": 179, \"function_name\": \"default\", \"text\": \"raise TypeError(f\\'Object of type {o.__class__.__name__} \\'\"}]}', output=None, summary={'weave': {'status': <TraceStatus.ERROR: 'error'>, 'trace_name': 'LLMJudgeScorer.score', 'latency_ms': 25}}, wb_user_id='VXNlcjo2Mzg4Nw==', wb_run_id=None, deleted_at=None)\n"
]
}
],
"source": [
"print(score_res)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ export const CallPage: FC<{
};

export const useShowRunnableUI = () => {
return false;
return true;
// Uncomment to re-enable
// const viewerInfo = useViewerInfo();
// return viewerInfo.loading ? false : viewerInfo.userInfo?.admin;
Expand Down
2 changes: 1 addition & 1 deletion weave/trace/weave_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ def _apply_scorer(self, scorer_op: Op | Scorer) -> ApplyScorerResult:
scorer_signature = inspect.signature(scorer_op)
scorer_arg_names = list(scorer_signature.parameters.keys())
if "inputs" in scorer_arg_names:
score_args = {"inputs": self.inputs}
score_args = {"inputs": {k: v for k, v in self.inputs.items() if k != "self"}}
else:
score_args = {k: v for k, v in self.inputs.items() if k in scorer_arg_names}
if self_arg is not None:
Expand Down
7 changes: 6 additions & 1 deletion weave/trace_server/todo.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,9 @@
* [ ] scorers should have a client-spec, not a specific client
* [ ] How to model a scorer's stub (input, output, context, reference(s), etc...)
* [ ] How to handle output types from scorers (boolean, number, reason, etc...)
* [ ] Investigate why the tests are running so slowly
* [ ] Investigate why the tests are running so slowly


---- Decomposition PRs ----
1. Change/Add the set_object_class instead of base_object_class
2. Add new methods to the server
5 changes: 5 additions & 0 deletions weave/trace_server_bindings/remote_http_trace_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,11 @@ def call_method(self, req: tsi.CallMethodReq) -> tsi.CallMethodRes:
"/execute/method", req, tsi.CallMethodReq, tsi.CallMethodRes
)

def score_call(self, req: tsi.ScoreCallReq) -> tsi.ScoreCallRes:
    """Forward a score-call request to the ``/execute/score_call`` endpoint.

    Args:
        req: The score-call request to send.

    Returns:
        Whatever ``self._generic_request`` returns for this endpoint
        (annotated as ``tsi.ScoreCallRes``).
    """
    endpoint = "/execute/score_call"
    # The request and response types are handed to the generic helper,
    # presumably for (de)serialization -- confirm against _generic_request.
    request_type = tsi.ScoreCallReq
    response_type = tsi.ScoreCallRes
    return self._generic_request(endpoint, req, request_type, response_type)


__docspec__ = [
RemoteHTTPTraceServer,
Expand Down

0 comments on commit 51a9a4c

Please sign in to comment.