diff --git a/docs/docs/reference/gen_notebooks/leaderboard_quickstart.md b/docs/docs/reference/gen_notebooks/leaderboard_quickstart.md new file mode 100644 index 00000000000..1830209847f --- /dev/null +++ b/docs/docs/reference/gen_notebooks/leaderboard_quickstart.md @@ -0,0 +1,343 @@ +--- +title: Leaderboard Quickstart +--- + + +:::tip[This is a notebook] + +
Open In Colab
Open in Colab
+ +
View in Github
View in Github
+ +::: + + + + + +# Leaderboard Quickstart + +In this notebook we will learn to use Weave's Leaderboard to compare model performance across different datasets and scoring functions. Specifically, we will: + +1. Generate a dataset of fake zip code data +2. Author some scoring functions and evaluate a baseline model. +3. Use these techniques to evaluate a matrix of models vs evaluations. +4. Review the leaderboard in the Weave UI. + +## Step 1: Generate a dataset of fake zip code data + +First we will create a function `generate_dataset_rows` that generates a list of fake zip code data. + + +```python +import json + +from openai import OpenAI +from pydantic import BaseModel + + +class Row(BaseModel): + zip_code: str + city: str + state: str + avg_temp_f: float + population: int + median_income: int + known_for: str + + +class Rows(BaseModel): + rows: list[Row] + + +def generate_dataset_rows( + location: str = "United States", count: int = 5, year: int = 2022 +): + client = OpenAI() + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": f"Please generate {count} rows of data for random zip codes in {location} for the year {year}.", + }, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "response_format", + "schema": Rows.model_json_schema(), + }, + }, + ) + + return json.loads(completion.choices[0].message.content)["rows"] +``` + + +```python +import weave + +weave.init("leaderboard-demo") +``` + +## Step 2: Author scoring functions + +Next we will author 3 scoring functions: + +1. `check_concrete_fields`: Checks if the model output matches the expected city and state. +2. `check_value_fields`: Checks if the model output is within 10% of the expected population and median income. +3. `check_subjective_fields`: Uses a LLM to check if the model output matches the expected "known for" field. + + + +```python +@weave.op +def check_concrete_fields(city: str, state: str, output: dict): + return { + "city_match": city == output["city"], + "state_match": state == output["state"], + } + + +@weave.op +def check_value_fields( + avg_temp_f: float, population: int, median_income: int, output: dict +): + return { + "avg_temp_f_err": abs(avg_temp_f - output["avg_temp_f"]) / avg_temp_f, + "population_err": abs(population - output["population"]) / population, + "median_income_err": abs(median_income - output["median_income"]) + / median_income, + } + + +@weave.op +def check_subjective_fields(zip_code: str, known_for: str, output: dict): + client = OpenAI() + + class Response(BaseModel): + correct_known_for: bool + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": f"My student was asked what the zip code {zip_code} is best known best for. The right answer is '{known_for}', and they said '{output['known_for']}'. Is their answer correct?", + }, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "response_format", + "schema": Response.model_json_schema(), + }, + }, + ) + + return json.loads(completion.choices[0].message.content) +``` + +## Step 3: Create a simple Evaluation + +Next we define a simple evaliation using our fake data and scoring functions. 
+ + + +```python +rows = generate_dataset_rows() +evaluation = weave.Evaluation( + name="United States - 2022", + dataset=rows, + scorers=[ + check_concrete_fields, + check_value_fields, + check_subjective_fields, + ], +) +``` + +## Step 4: Evaluate a baseline model + +Now we will evaluate a baseline model which returns a static response. + + + +```python +@weave.op +def baseline_model(zip_code: str): + return { + "city": "New York", + "state": "NY", + "avg_temp_f": 50.0, + "population": 1000000, + "median_income": 100000, + "known_for": "The Big Apple", + } + + +await evaluation.evaluate(baseline_model) +``` + +## Step 5: Create more Models + +Now we will create 2 more models to compare against the baseline. + + +```python +@weave.op +def gpt_4o_mini_no_context(zip_code: str): + client = OpenAI() + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": f"""Zip code {zip_code}"""}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "response_format", + "schema": Row.model_json_schema(), + }, + }, + ) + + return json.loads(completion.choices[0].message.content) + + +await evaluation.evaluate(gpt_4o_mini_no_context) +``` + + +```python +@weave.op +def gpt_4o_mini_with_context(zip_code: str): + client = OpenAI() + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": f"""Please answer the following questions about the zip code {zip_code}: + 1. What is the city? + 2. What is the state? + 3. What is the average temperature in Fahrenheit? + 4. What is the population? + 5. What is the median income? + 6. What is the most well known thing about this zip code? + """, + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "response_format", + "schema": Row.model_json_schema(), + }, + }, + ) + + return json.loads(completion.choices[0].message.content) + + +await evaluation.evaluate(gpt_4o_mini_with_context) +``` + +## Step 6: Create more Evaluations + +Now we will evaluate a matrix of models vs evaluations. + + + +```python +scorers = [ + check_concrete_fields, + check_value_fields, + check_subjective_fields, +] +evaluations = [ + weave.Evaluation( + name="United States - 2022", + dataset=weave.Dataset( + name="United States - 2022", + rows=generate_dataset_rows("United States", 5, 2022), + ), + scorers=scorers, + ), + weave.Evaluation( + name="California - 2022", + dataset=weave.Dataset( + name="California - 2022", rows=generate_dataset_rows("California", 5, 2022) + ), + scorers=scorers, + ), + weave.Evaluation( + name="United States - 2000", + dataset=weave.Dataset( + name="United States - 2000", + rows=generate_dataset_rows("United States", 5, 2000), + ), + scorers=scorers, + ), +] +models = [ + baseline_model, + gpt_4o_mini_no_context, + gpt_4o_mini_with_context, +] + +for evaluation in evaluations: + for model in models: + await evaluation.evaluate( + model, __weave={"display_name": evaluation.name + ":" + model.__name__} + ) +``` + +## Step 7: Review the Leaderboard + +You can create a new leaderboard by navigating to the leaderboard tab in the UI and clicking "Create Leaderboard". + +We can also generate a leaderboard directly from Python: + + +```python +from weave.flow import leaderboard +from weave.trace.weave_client import get_ref + +spec = leaderboard.Leaderboard( + name="Zip Code World Knowledge", + description=""" +This leaderboard compares the performance of models in terms of world knowledge about zip codes. 
+ +### Columns + +1. **State Match against `United States - 2022`**: The fraction of zip codes that the model correctly identified the state for. +2. **Avg Temp F Error against `California - 2022`**: The mean absolute error of the model's average temperature prediction. +3. **Correct Known For against `United States - 2000`**: The fraction of zip codes that the model correctly identified the most well known thing about the zip code. +""", + columns=[ + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluations[0]).uri(), + scorer_name="check_concrete_fields", + summary_metric_path="state_match.true_fraction", + ), + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluations[1]).uri(), + scorer_name="check_value_fields", + should_minimize=True, + summary_metric_path="avg_temp_f_err.mean", + ), + leaderboard.LeaderboardColumn( + evaluation_object_ref=get_ref(evaluations[2]).uri(), + scorer_name="check_subjective_fields", + summary_metric_path="correct_known_for.true_fraction", + ), + ], +) + +ref = weave.publish(spec) +``` diff --git a/docs/notebooks/leaderboard_quickstart.ipynb b/docs/notebooks/leaderboard_quickstart.ipynb new file mode 100644 index 00000000000..b09d336de51 --- /dev/null +++ b/docs/notebooks/leaderboard_quickstart.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Leaderboard Quickstart\n", + "\n", + "In this notebook we will learn to use Weave's Leaderboard to compare model performance across different datasets and scoring functions. Specifically, we will:\n", + "\n", + "1. Generate a dataset of fake zip code data\n", + "2. Author some scoring functions and evaluate a baseline model.\n", + "3. Use these techniques to evaluate a matrix of models vs evaluations.\n", + "4. Review the leaderboard in the Weave UI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Generate a dataset of fake zip code data\n", + "\n", + "First we will create a function `generate_dataset_rows` that generates a list of fake zip code data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "from openai import OpenAI\n", + "from pydantic import BaseModel\n", + "\n", + "\n", + "class Row(BaseModel):\n", + " zip_code: str\n", + " city: str\n", + " state: str\n", + " avg_temp_f: float\n", + " population: int\n", + " median_income: int\n", + " known_for: str\n", + "\n", + "\n", + "class Rows(BaseModel):\n", + " rows: list[Row]\n", + "\n", + "\n", + "def generate_dataset_rows(\n", + " location: str = \"United States\", count: int = 5, year: int = 2022\n", + "):\n", + " client = OpenAI()\n", + "\n", + " completion = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Please generate {count} rows of data for random zip codes in {location} for the year {year}.\",\n", + " },\n", + " ],\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"response_format\",\n", + " \"schema\": Rows.model_json_schema(),\n", + " },\n", + " },\n", + " )\n", + "\n", + " return json.loads(completion.choices[0].message.content)[\"rows\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "weave.init(\"leaderboard-demo\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Author scoring functions\n", + "\n", + "Next we will author 3 scoring functions:\n", + "\n", + "1. `check_concrete_fields`: Checks if the model output matches the expected city and state.\n", + "2. `check_value_fields`: Checks if the model output is within 10% of the expected population and median income.\n", + "3. `check_subjective_fields`: Uses a LLM to check if the model output matches the expected \"known for\" field.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "@weave.op\n", + "def check_concrete_fields(city: str, state: str, output: dict):\n", + " return {\n", + " \"city_match\": city == output[\"city\"],\n", + " \"state_match\": state == output[\"state\"],\n", + " }\n", + "\n", + "\n", + "@weave.op\n", + "def check_value_fields(\n", + " avg_temp_f: float, population: int, median_income: int, output: dict\n", + "):\n", + " return {\n", + " \"avg_temp_f_err\": abs(avg_temp_f - output[\"avg_temp_f\"]) / avg_temp_f,\n", + " \"population_err\": abs(population - output[\"population\"]) / population,\n", + " \"median_income_err\": abs(median_income - output[\"median_income\"])\n", + " / median_income,\n", + " }\n", + "\n", + "\n", + "@weave.op\n", + "def check_subjective_fields(zip_code: str, known_for: str, output: dict):\n", + " client = OpenAI()\n", + "\n", + " class Response(BaseModel):\n", + " correct_known_for: bool\n", + "\n", + " completion = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"My student was asked what the zip code {zip_code} is best known best for. The right answer is '{known_for}', and they said '{output['known_for']}'. 
Is their answer correct?\",\n", + " },\n", + " ],\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"response_format\",\n", + " \"schema\": Response.model_json_schema(),\n", + " },\n", + " },\n", + " )\n", + "\n", + " return json.loads(completion.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Create a simple Evaluation\n", + "\n", + "Next we define a simple evaliation using our fake data and scoring functions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "rows = generate_dataset_rows()\n", + "evaluation = weave.Evaluation(\n", + " name=\"United States - 2022\",\n", + " dataset=rows,\n", + " scorers=[\n", + " check_concrete_fields,\n", + " check_value_fields,\n", + " check_subjective_fields,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Evaluate a baseline model\n", + "\n", + "Now we will evaluate a baseline model which returns a static response.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@weave.op\n", + "def baseline_model(zip_code: str):\n", + " return {\n", + " \"city\": \"New York\",\n", + " \"state\": \"NY\",\n", + " \"avg_temp_f\": 50.0,\n", + " \"population\": 1000000,\n", + " \"median_income\": 100000,\n", + " \"known_for\": \"The Big Apple\",\n", + " }\n", + "\n", + "\n", + "await evaluation.evaluate(baseline_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create more Models\n", + "\n", + "Now we will create 2 more models to compare against the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@weave.op\n", + "def gpt_4o_mini_no_context(zip_code: str):\n", + " client = OpenAI()\n", + "\n", + " completion = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[{\"role\": \"user\", \"content\": f\"\"\"Zip code {zip_code}\"\"\"}],\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"response_format\",\n", + " \"schema\": Row.model_json_schema(),\n", + " },\n", + " },\n", + " )\n", + "\n", + " return json.loads(completion.choices[0].message.content)\n", + "\n", + "\n", + "await evaluation.evaluate(gpt_4o_mini_no_context)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@weave.op\n", + "def gpt_4o_mini_with_context(zip_code: str):\n", + " client = OpenAI()\n", + "\n", + " completion = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"\"\"Please answer the following questions about the zip code {zip_code}:\n", + " 1. What is the city?\n", + " 2. What is the state?\n", + " 3. What is the average temperature in Fahrenheit?\n", + " 4. What is the population?\n", + " 5. What is the median income?\n", + " 6. 
What is the most well known thing about this zip code?\n", + " \"\"\",\n", + " }\n", + " ],\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"response_format\",\n", + " \"schema\": Row.model_json_schema(),\n", + " },\n", + " },\n", + " )\n", + "\n", + " return json.loads(completion.choices[0].message.content)\n", + "\n", + "\n", + "await evaluation.evaluate(gpt_4o_mini_with_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Create more Evaluations\n", + "\n", + "Now we will evaluate a matrix of models vs evaluations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scorers = [\n", + " check_concrete_fields,\n", + " check_value_fields,\n", + " check_subjective_fields,\n", + "]\n", + "evaluations = [\n", + " weave.Evaluation(\n", + " name=\"United States - 2022\",\n", + " dataset=weave.Dataset(\n", + " name=\"United States - 2022\",\n", + " rows=generate_dataset_rows(\"United States\", 5, 2022),\n", + " ),\n", + " scorers=scorers,\n", + " ),\n", + " weave.Evaluation(\n", + " name=\"California - 2022\",\n", + " dataset=weave.Dataset(\n", + " name=\"California - 2022\", rows=generate_dataset_rows(\"California\", 5, 2022)\n", + " ),\n", + " scorers=scorers,\n", + " ),\n", + " weave.Evaluation(\n", + " name=\"United States - 2000\",\n", + " dataset=weave.Dataset(\n", + " name=\"United States - 2000\",\n", + " rows=generate_dataset_rows(\"United States\", 5, 2000),\n", + " ),\n", + " scorers=scorers,\n", + " ),\n", + "]\n", + "models = [\n", + " baseline_model,\n", + " gpt_4o_mini_no_context,\n", + " gpt_4o_mini_with_context,\n", + "]\n", + "\n", + "for evaluation in evaluations:\n", + " for model in models:\n", + " await evaluation.evaluate(\n", + " model, __weave={\"display_name\": evaluation.name + \":\" + model.__name__}\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Review the Leaderboard\n", + "\n", + "You can create a new leaderboard by navigating to the leaderboard tab in the UI and clicking \"Create Leaderboard\".\n", + "\n", + "We can also generate a leaderboard directly from Python:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Published to https://wandb.ai/timssweeney/leaderboard-demo/weave/leaderboards/Zip-Code-World-Knowledge\n" + ] + } + ], + "source": [ + "from weave.flow import leaderboard\n", + "from weave.trace.weave_client import get_ref\n", + "\n", + "spec = leaderboard.Leaderboard(\n", + " name=\"Zip Code World Knowledge\",\n", + " description=\"\"\"\n", + "This leaderboard compares the performance of models in terms of world knowledge about zip codes.\n", + "\n", + "### Columns\n", + "\n", + "1. **State Match against `United States - 2022`**: The fraction of zip codes that the model correctly identified the state for.\n", + "2. **Avg Temp F Error against `California - 2022`**: The mean absolute error of the model's average temperature prediction.\n", + "3. 
**Correct Known For against `United States - 2000`**: The fraction of zip codes that the model correctly identified the most well known thing about the zip code.\n", + "\"\"\",\n", + " columns=[\n", + " leaderboard.LeaderboardColumn(\n", + " evaluation_object_ref=get_ref(evaluations[0]).uri(),\n", + " scorer_name=\"check_concrete_fields\",\n", + " summary_metric_path=\"state_match.true_fraction\",\n", + " ),\n", + " leaderboard.LeaderboardColumn(\n", + " evaluation_object_ref=get_ref(evaluations[1]).uri(),\n", + " scorer_name=\"check_value_fields\",\n", + " should_minimize=True,\n", + " summary_metric_path=\"avg_temp_f_err.mean\",\n", + " ),\n", + " leaderboard.LeaderboardColumn(\n", + " evaluation_object_ref=get_ref(evaluations[2]).uri(),\n", + " scorer_name=\"check_subjective_fields\",\n", + " summary_metric_path=\"correct_known_for.true_fraction\",\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "ref = weave.publish(spec)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wandb-weave", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/weave-js/src/components/FancyPage/useProjectSidebar.ts b/weave-js/src/components/FancyPage/useProjectSidebar.ts index b77290b7d1d..c9d0b928997 100644 --- a/weave-js/src/components/FancyPage/useProjectSidebar.ts +++ b/weave-js/src/components/FancyPage/useProjectSidebar.ts @@ -31,6 +31,7 @@ export const useProjectSidebar = ( const isNoSidebarItems = !showModelsSidebarItems && !showWeaveSidebarItems; const isBothSidebarItems = showModelsSidebarItems && showWeaveSidebarItems; const isShowAll = isNoSidebarItems || isBothSidebarItems; + return useMemo(() => { const allItems = isLoading ? 
[] @@ -137,6 +138,11 @@ export const useProjectSidebar = ( isShown: showWeaveSidebarItems || isShowAll, iconName: IconNames.LayoutTabs, }, + { + type: 'divider' as const, + key: 'dividerWithinWeave-1', + isShown: isWeaveOnly, + }, { type: 'button' as const, name: 'Evals', @@ -144,6 +150,18 @@ export const useProjectSidebar = ( isShown: showWeaveSidebarItems || isShowAll, iconName: IconNames.BaselineAlt, }, + { + type: 'button' as const, + name: 'Leaders', + slug: 'weave/leaderboards', + isShown: isWeaveOnly, + iconName: IconNames.BenchmarkSquare, + }, + { + type: 'divider' as const, + key: 'dividerWithinWeave-2', + isShown: isWeaveOnly, + }, // { // type: 'button' as const, // name: 'Prompts', @@ -167,7 +185,7 @@ export const useProjectSidebar = ( }, { type: 'divider' as const, - key: 'dividerWithinWeave', + key: 'dividerWithinWeave-3', isShown: isWeaveOnly, }, { @@ -193,7 +211,7 @@ export const useProjectSidebar = ( key: 'moreWeave', isShown: isShowAll, // iconName: IconNames.OverflowHorizontal, - menu: ['weave/operations', 'weave/objects'], + menu: ['weave/leaderboards', 'weave/operations', 'weave/objects'], }, ]; @@ -220,10 +238,10 @@ export const useProjectSidebar = ( return onlyShownItems; }, [ isLoading, - isModelsOnly, - isWeaveOnly, - showWeaveSidebarItems, isShowAll, + isWeaveOnly, viewingRestricted, + isModelsOnly, + showWeaveSidebarItems, ]); }; diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse3.tsx index bee4705042c..d9098d0970b 100644 --- a/weave-js/src/components/PagePanelComponents/Home/Browse3.tsx +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3.tsx @@ -82,6 +82,8 @@ import {Empty} from './Browse3/pages/common/Empty'; import {EMPTY_NO_TRACE_SERVER} from './Browse3/pages/common/EmptyContent'; import {SimplePageLayoutContext} from './Browse3/pages/common/SimplePageLayout'; import {CompareEvaluationsPage} from './Browse3/pages/CompareEvaluationsPage/CompareEvaluationsPage'; +import {LeaderboardListingPage} from './Browse3/pages/LeaderboardPage/LeaderboardListingPage'; +import {LeaderboardPage} from './Browse3/pages/LeaderboardPage/LeaderboardPage'; import {ObjectPage} from './Browse3/pages/ObjectPage'; import {ObjectVersionPage} from './Browse3/pages/ObjectVersionPage'; import { @@ -151,6 +153,7 @@ const tabOptions = [ 'op-versions', 'calls', 'evaluations', + 'leaderboards', 'boards', 'tables', ]; @@ -492,6 +495,13 @@ const Browse3ProjectRoot: FC<{ + + + {/* BOARDS */} { ); }; +const LeaderboardPageBinding = () => { + const params = useParamsDecoded(); + const {entity, project, itemName: leaderboardName} = params; + const query = useURLSearchParamsDict(); + const edit = query.edit === 'true'; + if (!leaderboardName) { + return ; + } + return ( + + ); +}; + const OpsPageBinding = () => { const params = useParamsDecoded(); diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3/context.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse3/context.tsx index 640e68441b6..f453e583c54 100644 --- a/weave-js/src/components/PagePanelComponents/Home/Browse3/context.tsx +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3/context.tsx @@ -193,6 +193,14 @@ export const browse2Context = { ) => { throw new Error('Not implemented'); }, + leaderboardsUIUrl: ( + entityName: string, + projectName: string, + leaderboardName?: string, + edit?: boolean + ) => { + throw new Error('Not implemented'); + }, }; export const browse3ContextGen = ( @@ -422,6 +430,16 @@ export 
const browse3ContextGen = ( JSON.stringify(evaluationCallIds) )}${metricsPart}`; }, + leaderboardsUIUrl: ( + entityName: string, + projectName: string, + leaderboardName?: string, + edit?: boolean + ) => { + return `${projectRoot(entityName, projectName)}/leaderboards${ + leaderboardName ? `/${leaderboardName}` : '' + }${edit ? '?edit=true' : ''}`; + }, }; return browse3Context; }; @@ -506,6 +524,12 @@ type RouteType = { evaluationCallIds: string[], metrics: Record | null ) => string; + leaderboardsUIUrl: ( + entityName: string, + projectName: string, + leaderboardName?: string, + edit?: boolean + ) => string; }; const useSetSearchParam = () => { @@ -619,6 +643,11 @@ const useMakePeekingRouter = (): RouteType => { baseContext.compareEvaluationsUri(...args) ); }, + leaderboardsUIUrl: ( + ...args: Parameters + ) => { + return setSearchParam(PEEK_PARAM, baseContext.leaderboardsUIUrl(...args)); + }, }; }; diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardConfigEditor.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardConfigEditor.tsx new file mode 100644 index 00000000000..e1e1f3b5ffd --- /dev/null +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardConfigEditor.tsx @@ -0,0 +1,327 @@ +import {Box} from '@material-ui/core'; +import {PopupDropdown} from '@wandb/weave/common/components/PopupDropdown'; +import {Button} from '@wandb/weave/components/Button/Button'; +import {Select} from '@wandb/weave/components/Form/Select'; +import {TextField} from '@wandb/weave/components/Form/TextField'; +import { + IconChevronDown, + IconChevronUp, + IconCopy, + IconDelete, + IconSortAscending, + IconSortDescending, +} from '@wandb/weave/components/Icon'; +import _ from 'lodash'; +import React, {useMemo} from 'react'; + +import {LeaderboardObjectVal} from '../../views/Leaderboard/types/leaderboardConfigType'; +import { + EvaluationHelperObj, + useEvaluationObjects, + useMetrics, + useScorers, +} from './leaderboardConfigEditorHooks'; +import {SimpleCodeLikeTextArea} from './SimpleCodeLikeTextArea'; + +export const LeaderboardConfigEditor: React.FC<{ + entity: string; + project: string; + leaderboardVal: LeaderboardObjectVal; + saving: boolean; + isDirty: boolean; + setWorkingCopy: (leaderboardVal: LeaderboardObjectVal) => void; + discardChanges: () => void; + commitChanges: () => void; +}> = ({ + entity, + project, + leaderboardVal, + saving, + isDirty, + setWorkingCopy, + discardChanges, + commitChanges, +}) => { + const handleNameChange = (value: string) => { + setWorkingCopy({...leaderboardVal, name: value}); + }; + + const handleDescriptionChange = (value: string) => { + setWorkingCopy({...leaderboardVal, description: value}); + }; + + const handleColumnChange = (index: number, field: string, value: any) => { + const newColumns = [...leaderboardVal.columns]; + newColumns[index] = {...newColumns[index], [field]: value}; + + // Reset dependent fields when changing evaluation_object_ref or scorer_name + if (field === 'evaluation_object_ref') { + newColumns[index].scorer_name = ''; + newColumns[index].summary_metric_path = ''; + } else if (field === 'scorer_name') { + newColumns[index].summary_metric_path = ''; + } + + setWorkingCopy({...leaderboardVal, columns: newColumns}); + }; + + const addColumn = () => { + setWorkingCopy({ + ...leaderboardVal, + columns: [ + ...leaderboardVal.columns, + { + evaluation_object_ref: '', + scorer_name: '', + should_minimize: false, + 
summary_metric_path: '', + }, + ], + }); + }; + + const removeColumn = (index: number) => { + const newColumns = leaderboardVal.columns.filter((v, i) => i !== index); + setWorkingCopy({...leaderboardVal, columns: newColumns}); + }; + + const cloneColumn = (index: number) => { + const newColumns = [...leaderboardVal.columns]; + newColumns.splice(index + 1, 0, {...newColumns[index]}); + setWorkingCopy({...leaderboardVal, columns: newColumns}); + }; + + const moveColumn = (fromIndex: number, toIndex: number) => { + const newColumns = [...leaderboardVal.columns]; + const [removed] = newColumns.splice(fromIndex, 1); + newColumns.splice(toIndex, 0, removed); + setWorkingCopy({...leaderboardVal, columns: newColumns}); + }; + + const evalObjs = useEvaluationObjects(entity, project); + + return ( + + + + + + + + + + Evaluation + + Scorer + Metric + + {leaderboardVal.columns.map((column, index) => ( + + ))} + + + + + + + + + + ); +}; + +const Label: React.FC<{children: React.ReactNode}> = ({children}) => { + return ( + + {children} + + ); +}; + +const ColumnEditor: React.FC<{ + column: LeaderboardObjectVal['columns'][0]; + index: number; + evalObjs: EvaluationHelperObj[]; + entity: string; + project: string; + handleColumnChange: (index: number, field: string, value: any) => void; + moveColumn: (fromIndex: number, toIndex: number) => void; + cloneColumn: (index: number) => void; + removeColumn: (index: number) => void; + totalColumns: number; +}> = ({ + column, + index, + evalObjs, + entity, + project, + handleColumnChange, + moveColumn, + cloneColumn, + removeColumn, + totalColumns, +}) => { + const scorers = useScorers(entity, project, column.evaluation_object_ref); + const metrics = useMetrics( + entity, + project, + column.evaluation_object_ref, + column.scorer_name + ); + const selectedEvalObj = evalObjs.find( + obj => obj.ref === column.evaluation_object_ref + ); + const selectedScorer = useMemo( + () => (column.scorer_name ? {val: column.scorer_name} : undefined), + [column.scorer_name] + ); + const selectedMetricPath = useMemo( + () => ({val: column.summary_metric_path}), + [column.summary_metric_path] + ); + const shouldMinimize = column.should_minimize ?? false; + return ( + <> + + value={selectedEvalObj} + placeholder="Evaluation Definition" + onChange={newVal => + handleColumnChange(index, 'evaluation_object_ref', newVal?.ref) + } + options={Object.entries(_.groupBy(evalObjs, 'name')).map( + ([name, objs]) => ({options: objs, label: name}) + )} + getOptionLabel={obj => + `${obj.name}:v${obj.versionIndex} (${obj.digest.slice(0, 6)})` + } + getOptionValue={obj => obj.ref} + /> + + value={selectedScorer} + onChange={newVal => + handleColumnChange(index, 'scorer_name', newVal?.val) + } + options={scorers.map(scorer => ({val: scorer}))} + isDisabled={!column.evaluation_object_ref} + getOptionLabel={scorer => scorer.val} + getOptionValue={scorer => scorer.val} + /> + + value={selectedMetricPath} + onChange={newVal => + handleColumnChange(index, 'summary_metric_path', newVal?.val ?? 
'') + } + options={metrics.map(metric => ({val: metric}))} + isDisabled={!column.evaluation_object_ref || !column.scorer_name} + getOptionLabel={metric => metric.val} + getOptionValue={metric => metric.val} + /> + , + onClick: () => moveColumn(index, index - 1), + disabled: index === 0, + }, + { + key: 'moveAfter', + text: 'Move After', + icon: , + onClick: () => moveColumn(index, index + 1), + disabled: index === totalColumns - 1, + }, + { + key: 'duplicate', + text: 'Duplicate', + icon: , + onClick: () => cloneColumn(index), + }, + { + key: 'delete', + text: 'Delete', + icon: , + onClick: () => removeColumn(index), + }, + { + key: 'changeSortDirection', + text: shouldMinimize ? 'Sort Descending' : 'Sort Ascending', + icon: shouldMinimize ? ( + + ) : ( + + ), + onClick: () => + handleColumnChange(index, 'should_minimize', !shouldMinimize), + }, + ], + ]} + trigger={ + + + ); +}; + +const LeaderboardTable: React.FC<{ + entity: string; + project: string; +}> = props => { + const history = useHistory(); + const {peekingRouter} = useWeaveflowRouteContext(); + + // TODO: Once `useCollectionObjects` lands from the online + // evals project, switch to that (much more type safe) + const leaderboardQuery = useBaseObjectInstances('Leaderboard', { + project_id: projectIdFromParts({ + entity: props.entity, + project: props.project, + }), + filter: {latest_only: true}, + }); + + const leaderboardObjectVersions = useMemo(() => { + return (leaderboardQuery.result ?? []).map( + convertTraceServerObjectVersionToSchema + ); + }, [leaderboardQuery.result]); + const onClick = useCallback( + (obj: ObjectVersionSchema) => { + const to = peekingRouter.leaderboardsUIUrl( + props.entity, + props.project, + obj.objectId + ); + history.push(to); + }, + [history, peekingRouter, props.entity, props.project] + ); + + if (leaderboardQuery.loading) { + return ; + } + + const isEmpty = leaderboardObjectVersions.length === 0; + if (isEmpty) { + return ; + } + + return ( + + ); +}; + +const generateLeaderboardId = () => { + const timestamp = new Date().getTime(); + const timestampHex = timestamp.toString(36); + return `leaderboard-${timestampHex}`; +}; + +const useCreateLeaderboard = (entity: string, project: string) => { + const createLeaderboardInstance = useCreateBaseObjectInstance('Leaderboard'); + + const createLeaderboard = async () => { + const objectId = sanitizeObjectId(generateLeaderboardId()); + await createLeaderboardInstance({ + obj: { + project_id: projectIdFromParts({entity, project}), + object_id: objectId, + val: { + name: objectId, + description: '', + columns: [], + }, + }, + }); + return objectId; + }; + + return createLeaderboard; +}; + +const useNavigateToLeaderboard = (entity: string, project: string) => { + const history = useHistory(); + const {baseRouter} = useWeaveflowRouteContext(); + const navigateToLeaderboard = useCallback( + (objectId: string) => { + const to = baseRouter.leaderboardsUIUrl(entity, project, objectId, true); + history.push(to); + }, + [history, baseRouter, entity, project] + ); + return navigateToLeaderboard; +}; diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardPage.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardPage.tsx new file mode 100644 index 00000000000..6fac8eaa599 --- /dev/null +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/LeaderboardPage.tsx @@ -0,0 +1,385 @@ +import {Box} from '@mui/material'; +import {MOON_250} from 
'@wandb/weave/common/css/color.styles'; +import {useViewerInfo} from '@wandb/weave/common/hooks/useViewerInfo'; +import {Button} from '@wandb/weave/components/Button'; +import {Loading} from '@wandb/weave/components/Loading'; +import _ from 'lodash'; +import React, { + FC, + useCallback, + useContext, + useEffect, + useMemo, + useState, +} from 'react'; +import ReactMarkdown from 'react-markdown'; +import styled from 'styled-components'; + +import {WeaveflowPeekContext} from '../../context'; +import {NotFoundPanel} from '../../NotFoundPanel'; +import { + LeaderboardColumnOrderType, + LeaderboardGrid, +} from '../../views/Leaderboard/LeaderboardGrid'; +import {useSavedLeaderboardData} from '../../views/Leaderboard/query/hookAdapters'; +import {LeaderboardObjectVal} from '../../views/Leaderboard/types/leaderboardConfigType'; +import {SimplePageLayout} from '../common/SimplePageLayout'; +import { + useBaseObjectInstances, + useCreateBaseObjectInstance, +} from '../wfReactInterface/baseObjectClassQuery'; +import {projectIdFromParts} from '../wfReactInterface/tsDataModelHooks'; +import {LeaderboardConfigEditor} from './LeaderboardConfigEditor'; + +type LeaderboardPageProps = { + entity: string; + project: string; + leaderboardName: string; + openEditorOnMount?: boolean; +}; + +export const LeaderboardPage: React.FC = props => { + const [name, setName] = useState(props.leaderboardName); + const {isPeeking} = useContext(WeaveflowPeekContext); + const {isEditor} = useIsEditor(props.entity); + const [isEditing, setIsEditing] = useState(false); + useEffect(() => { + if (isEditor && props.openEditorOnMount) { + setIsEditing(true); + } + }, [isEditor, props.openEditorOnMount]); + return ( + + ), + }, + ]} + headerExtra={ + !isPeeking && + !isEditing && + isEditor && ( + + ) + } + /> + ); +}; + +export const LeaderboardPageContent: React.FC< + LeaderboardPageProps & { + setName: (name: string) => void; + isEditing: boolean; + setIsEditing: (isEditing: boolean) => void; + } +> = props => { + const {entity, project} = props; + const leaderboardInstances = useBaseObjectInstances('Leaderboard', { + project_id: projectIdFromParts({entity, project}), + filter: {object_ids: [props.leaderboardName], latest_only: true}, + }); + + if (leaderboardInstances.loading) { + return ; + } + + if ( + leaderboardInstances.result == null || + leaderboardInstances.result.length !== 1 + ) { + return ( + + ); + } + + const leaderboardVal = leaderboardInstances.result[0].val; + + if (leaderboardVal == null) { + return ( + + ); + } + + return ( + + ); +}; + +const useUpdateLeaderboard = ( + entity: string, + project: string, + objectId: string +) => { + const createLeaderboard = useCreateBaseObjectInstance('Leaderboard'); + + const updateLeaderboard = async (leaderboardVal: LeaderboardObjectVal) => { + return await createLeaderboard({ + obj: { + project_id: projectIdFromParts({entity, project}), + object_id: objectId, + val: leaderboardVal, + }, + }); + }; + + return updateLeaderboard; +}; + +export const LeaderboardPageContentInner: React.FC< + LeaderboardPageProps & { + setName: (name: string) => void; + isEditing: boolean; + setIsEditing: (isEditing: boolean) => void; + } & { + leaderboardVal: LeaderboardObjectVal; + } +> = props => { + const updateLeaderboard = useUpdateLeaderboard( + props.entity, + props.project, + props.leaderboardName + ); + const [leaderboardVal, setLeaderboardVal] = useState(props.leaderboardVal); + const [workingLeaderboardValCopy, setWorkingLeaderboardValCopy] = + useState(leaderboardVal); + 
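+  // Propagate name changes in the working copy up to the parent page (props.setName)
+  // so the displayed leaderboard name stays in sync while editing.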
useEffect(() => { + props.setName(workingLeaderboardValCopy.name ?? ''); + }, [props, workingLeaderboardValCopy.name]); + const {loading, data, evalData} = useSavedLeaderboardData( + props.entity, + props.project, + workingLeaderboardValCopy.columns + ); + const [saving, setSaving] = useState(false); + const discardChanges = useCallback(() => { + setWorkingLeaderboardValCopy(leaderboardVal); + props.setIsEditing(false); + }, [leaderboardVal, props]); + const commitChanges = useCallback(() => { + const mounted = true; + setSaving(true); + updateLeaderboard(workingLeaderboardValCopy) + .then(() => { + if (mounted) { + props.setIsEditing(false); + setLeaderboardVal(workingLeaderboardValCopy); + setWorkingLeaderboardValCopy(workingLeaderboardValCopy); + setSaving(false); + } + }) + .catch(e => { + console.error(e); + if (mounted) { + setWorkingLeaderboardValCopy(leaderboardVal); + setSaving(false); + } + }); + }, [leaderboardVal, props, updateLeaderboard, workingLeaderboardValCopy]); + const isDirty = useMemo(() => { + return !_.isEqual(leaderboardVal, workingLeaderboardValCopy); + }, [leaderboardVal, workingLeaderboardValCopy]); + const columnOrder = useMemo(() => { + return workingLeaderboardValCopy.columns + .map(col => { + const datasetGroup = evalData[col.evaluation_object_ref]?.datasetGroup; + const scorerGroup = + evalData[col.evaluation_object_ref]?.scorers[col.scorer_name]; + const metricGroup = col.summary_metric_path; + + if (datasetGroup && scorerGroup && metricGroup) { + return { + datasetGroup, + scorerGroup, + metricGroup, + minimize: col.should_minimize ?? false, + }; + } + return null; + }) + .filter(c => c != null) as LeaderboardColumnOrderType; + }, [workingLeaderboardValCopy, evalData]); + + return ( + + + {workingLeaderboardValCopy.description && ( + + + {workingLeaderboardValCopy.description} + + + )} + + + + + {props.isEditing && ( + + + + )} + + ); +}; + +export const ToggleLeaderboardConfig: React.FC<{ + isOpen: boolean; + onClick: () => void; +}> = ({isOpen, onClick}) => { + return ( + + + + ); +}; + +export const useIsEditor = (entity: string) => { + const {loading: loadingUserInfo, userInfo} = useViewerInfo(); + return useMemo(() => { + if (loadingUserInfo) { + return { + loading: true, + isEditor: false, + }; + } + const viewer = userInfo ? userInfo.id : null; + + return { + loading: false, + isEditor: viewer && userInfo?.teams.includes(entity), + }; + }, [entity, loadingUserInfo, userInfo]); +}; + +const StyledReactMarkdown = styled(ReactMarkdown)` + > *:first-child { + margin-top: 0; + } + h1 { + font-weight: 600; + font-size: 1.2rem; + } + h2 { + font-weight: 600; + font-size: 1.15rem; + } + h3 { + font-weight: 600; + font-size: 1.1rem; + } + h4 { + font-weight: 600; + font-size: 1.05rem; + } + h5 { + font-weight: 600; + font-size: 1rem; + } + h6 { + font-weight: 600; + font-size: 1rem; + } +`; diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/SimpleCodeLikeTextArea.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/SimpleCodeLikeTextArea.tsx new file mode 100644 index 00000000000..826e0717b1a --- /dev/null +++ b/weave-js/src/components/PagePanelComponents/Home/Browse3/pages/LeaderboardPage/SimpleCodeLikeTextArea.tsx @@ -0,0 +1,221 @@ +/** + * A simple multi-line text editor component that mimics code editor styling. + * Features auto-sizing, manual resize handle, and code-like formatting. 
+ * + * Inspired by: weave-js/src/components/Form/TextField.tsx + */ + +import {Tailwind} from '@wandb/weave/components/Tailwind'; +import classNames from 'classnames'; +import React, {useCallback, useEffect, useRef, useState} from 'react'; + +export const TextAreaSizes = { + Medium: 'medium', + Large: 'large', +} as const; +export type TextAreaSize = (typeof TextAreaSizes)[keyof typeof TextAreaSizes]; + +type TextAreaProps = { + placeholder?: string; + value?: string; + onChange?: (value: string) => void; + onKeyDown?: ( + key: string, + e: React.KeyboardEvent + ) => void; + onBlur?: (value: string) => void; + autoFocus?: boolean; + disabled?: boolean; + ariaLabel?: string; + errorState?: boolean; + maxLength?: number; + maxRows?: number; + dataTest?: string; +}; + +export const SimpleCodeLikeTextArea = ({ + placeholder, + value, + onChange, + onKeyDown, + onBlur, + autoFocus, + disabled, + ariaLabel, + errorState, + maxLength, + maxRows = 8, + dataTest, +}: TextAreaProps) => { + const textareaRef = useRef(null); + const [isManuallyResized, setIsManuallyResized] = useState(false); + const isDraggingRef = useRef(false); + const initialHeightRef = useRef(0); + const initialMouseYRef = useRef(0); + + // Automatically adjust height based on content + const adjustHeight = () => { + const textarea = textareaRef.current; + if (!textarea || isManuallyResized) { + return; + } + + textarea.style.height = 'auto'; + const lineHeight = parseInt( + getComputedStyle(textarea).lineHeight || '20', + 10 + ); + const maxHeight = lineHeight * maxRows; + const newHeight = Math.min(textarea.scrollHeight, maxHeight); + textarea.style.height = `${newHeight}px`; + }; + + useEffect(() => { + adjustHeight(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [value, isManuallyResized]); + + // Handle resize drag start + const handleResizeStart = (e: React.MouseEvent) => { + e.preventDefault(); + if (disabled) { + return; + } + + const textarea = textareaRef.current; + if (!textarea) { + return; + } + + isDraggingRef.current = true; + setIsManuallyResized(true); + initialHeightRef.current = textarea.offsetHeight; + initialMouseYRef.current = e.clientY; + + // Add event listeners for drag and release + document.addEventListener('mousemove', handleResizeMove); + document.addEventListener('mouseup', handleResizeEnd); + }; + + // Handle resize drag movement + const handleResizeMove = useCallback((e: MouseEvent) => { + if (!isDraggingRef.current || !textareaRef.current) { + return; + } + + const deltaY = e.clientY - initialMouseYRef.current; + const newHeight = Math.max(80, initialHeightRef.current + deltaY); // Min height of 80px + textareaRef.current.style.height = `${newHeight}px`; + }, []); + + // Handle resize drag end + const handleResizeEnd = useCallback(() => { + isDraggingRef.current = false; + document.removeEventListener('mousemove', handleResizeMove); + document.removeEventListener('mouseup', handleResizeEnd); + }, [handleResizeMove]); + + // Cleanup event listeners + useEffect(() => { + return () => { + document.removeEventListener('mousemove', handleResizeMove); + document.removeEventListener('mouseup', handleResizeEnd); + }; + }, [handleResizeEnd, handleResizeMove]); + + // Double click handler to reset to auto-size + const handleResizeDoubleClick = () => { + setIsManuallyResized(false); + adjustHeight(); + }; + + const handleChange = onChange + ? (e: React.ChangeEvent) => { + onChange(e.target.value); + } + : undefined; + const handleKeyDown = onKeyDown + ? 
(e: React.KeyboardEvent) => { + onKeyDown(e.key, e); + } + : undefined; + const handleBlur = onBlur + ? (e: React.ChangeEvent) => { + onBlur?.(e.target.value); + } + : undefined; + + return ( + +
+
+