Update monitor & plots (#2506)
merrymercy authored Oct 2, 2023
1 parent f5eee7d commit 759dfbe
Showing 6 changed files with 46 additions and 18 deletions.
11 changes: 11 additions & 0 deletions docs/commands/leaderboard.md
@@ -24,3 +24,14 @@ scp atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/elo_results_20230905.pkl
```
wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/raw/main/leaderboard_table_20230905.csv
```

### Update files on webserver
```
DATE=20231002
rm -rf elo_results.pkl leaderboard_table.csv
wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/elo_results_$DATE.pkl
wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/leaderboard_table_$DATE.csv
ln -s leaderboard_table_$DATE.csv leaderboard_table.csv
ln -s elo_results_$DATE.pkl elo_results.pkl
```
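
The same refresh can be scripted. Below is a minimal Python sketch of the equivalent steps, assuming the `resolve` URLs above remain publicly downloadable; the `DATE` value is just the date used in this commit.

```
import os
import urllib.request

DATE = "20231002"  # substitute the date of the latest upload
BASE = "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main"

for dated_name, stable_name in [
    (f"elo_results_{DATE}.pkl", "elo_results.pkl"),
    (f"leaderboard_table_{DATE}.csv", "leaderboard_table.csv"),
]:
    # Download the dated artifact, then point the stable name at it,
    # mirroring the wget + ln -s commands above.
    urllib.request.urlretrieve(f"{BASE}/{dated_name}", dated_name)
    if os.path.lexists(stable_name):
        os.remove(stable_name)
    os.symlink(dated_name, stable_name)
```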
16 changes: 10 additions & 6 deletions fastchat/serve/huggingface_api_worker.py
@@ -1,18 +1,19 @@
"""
A model worker to call huggingface api.
JSON file format:
A model worker that calls huggingface inference endpoint.
Register models in a JSON file with the following format:
{
"falcon-180b-chat": {
"model_path": "tiiuae/falcon-180B-chat",
"api_base": "https://api-inference.huggingface.co/models",
"token": "hf_xxx",
"context_length": 2048
"context_length": 2048,
"model_names": "falcon-180b-chat",
"conv_template": null,
"conv_template": null
}
}
Only "model_path", "api_base", and "token" are necessary, others are optional.
"model_path", "api_base", "token", and "context_length" are necessary, while others are optional.
"""
import argparse
import asyncio
@@ -116,6 +117,9 @@ def __init__(
f"Connecting with huggingface api {self.model_path} as {self.model_names} on worker {worker_id} ..."
)

if not no_register:
self.init_heart_beat()

def count_token(self, params):
# No tokenizer here
ret = {
@@ -312,7 +316,7 @@ def create_huggingface_api_worker():
api_base_list.append(model_info[m]["api_base"])
token_list.append(model_info[m]["token"])

context_length = model_info[m].get("context_length", 1024)
context_length = model_info[m]["context_length"]
model_names = model_info[m].get("model_names", [m.split("/")[-1]])
if isinstance(model_names, str):
model_names = [model_names]
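
For context on why `context_length` moved from optional to required: the worker now indexes it directly instead of falling back to a default. A small, hypothetical validation helper (not part of the worker) that loads a registration file in the docstring's format and fails fast on missing required keys might look like this:

```
import json

REQUIRED_KEYS = ("model_path", "api_base", "token", "context_length")

def load_model_info(path):
    # `path` is a JSON file shaped like the docstring example above.
    with open(path) as f:
        model_info = json.load(f)
    for name, info in model_info.items():
        missing = [k for k in REQUIRED_KEYS if k not in info]
        if missing:
            raise ValueError(f"{name}: missing required key(s) {missing}")
        # Optional fields; the model_names fallback matches the diff, while
        # the conv_template default of None is an assumption.
        info.setdefault("model_names", [name.split("/")[-1]])
        info.setdefault("conv_template", None)
    return model_info
```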
4 changes: 2 additions & 2 deletions fastchat/serve/monitor/clean_battle_data.py
@@ -44,8 +44,8 @@

def get_log_files(max_num_files=None):
dates = []
for month in [4, 5, 6, 7, 8, 9]:
for day in range(1, 32):
for month in range(4, 12):
for day in range(1, 33):
dates.append(f"2023-{month:02d}-{day:02d}")

filenames = []
4 changes: 2 additions & 2 deletions fastchat/serve/monitor/clean_chat_data.py
@@ -28,8 +28,8 @@

def get_log_files(max_num_files=None):
dates = []
for month in [4, 5, 6, 7, 8, 9, 10]:
for day in range(1, 32):
for month in range(4, 12):
for day in range(1, 33):
dates.append(f"2023-{month:02d}-{day:02d}")

filenames = []
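
Both cleaning scripts build their candidate dates the same way, so the widened loops simply extend the scan from April through November 2023. A rough sketch of what the enumeration produces; impossible dates such as 2023-04-31 or day 32 are presumably harmless because (in surrounding code not shown in this diff) only log files that actually exist are kept:

```
# Candidate date strings for 2023, months 4..11 and days 1..32.
dates = [
    f"2023-{month:02d}-{day:02d}"
    for month in range(4, 12)
    for day in range(1, 33)
]
print(len(dates), dates[0], dates[-1])  # 256 2023-04-01 2023-11-32
```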
25 changes: 19 additions & 6 deletions fastchat/serve/monitor/elo_analysis.py
@@ -58,7 +58,7 @@ def get_median_elo_from_bootstrap(bootstrap_df):
return median


def compute_pairwise_win_fraction(battles, model_order):
def compute_pairwise_win_fraction(battles, model_order, limit_show_number=None):
# Times each model wins as Model A
a_win_ptbl = pd.pivot_table(
battles[battles["winner"] == "model_a"],
@@ -92,6 +92,9 @@ def compute_pairwise_win_fraction(battles, model_order):
prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
model_order = list(prop_wins.keys())

if limit_show_number is not None:
model_order = model_order[:limit_show_number]

# Arrange ordering according to proprition of wins
row_beats_col = row_beats_col_freq.loc[model_order, model_order]
return row_beats_col
@@ -166,8 +169,10 @@ def visualize_battle_count(battles, model_order):
return fig


def visualize_average_win_rate(battles):
row_beats_col_freq = compute_pairwise_win_fraction(battles, None)
def visualize_average_win_rate(battles, limit_show_number):
row_beats_col_freq = compute_pairwise_win_fraction(
battles, None, limit_show_number=limit_show_number
)
fig = px.bar(
row_beats_col_freq.mean(axis=1).sort_values(ascending=False),
text_auto=".2f",
@@ -180,7 +185,7 @@ def visualize_average_win_rate(battles):
return fig


def visualize_bootstrap_elo_rating(df):
def visualize_bootstrap_elo_rating(df, limit_show_number):
bars = (
pd.DataFrame(
dict(
@@ -192,6 +197,7 @@ def visualize_bootstrap_elo_rating(df):
.reset_index(names="model")
.sort_values("rating", ascending=False)
)
bars = bars[:limit_show_number]
bars["error_y"] = bars["upper"] - bars["rating"]
bars["error_y_minus"] = bars["rating"] - bars["lower"]
bars["rating_rounded"] = np.round(bars["rating"], 2)
@@ -225,12 +231,19 @@ def report_elo_analysis_results(battles_json):
model_order = list(elo_rating_median.keys())
model_order.sort(key=lambda k: -elo_rating_median[k])

limit_show_number = 25 # limit show number to make plots smaller
model_order = model_order[:limit_show_number]

# Plots
leaderboard_table = visualize_leaderboard_table(elo_rating_median)
win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order)
battle_count_heatmap = visualize_battle_count(battles_no_ties, model_order)
average_win_rate_bar = visualize_average_win_rate(battles_no_ties)
bootstrap_elo_rating = visualize_bootstrap_elo_rating(bootstrap_df)
average_win_rate_bar = visualize_average_win_rate(
battles_no_ties, limit_show_number
)
bootstrap_elo_rating = visualize_bootstrap_elo_rating(
bootstrap_df, limit_show_number
)

last_updated_tstamp = battles["tstamp"].max()
last_updated_datetime = datetime.datetime.fromtimestamp(
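
The `limit_show_number` plumbing only affects what gets plotted: the ordered model list is truncated to the top 25 before building the win-fraction heatmap, the average win-rate bars, and the bootstrap rating bars. A standalone sketch of that truncation step, using made-up ratings rather than real leaderboard data:

```
import pandas as pd

# Hypothetical median Elo ratings keyed by model name.
elo_rating_median = {"model-a": 1200, "model-b": 1150, "model-c": 1100, "model-d": 900}

# Order models from strongest to weakest, as report_elo_analysis_results does.
model_order = sorted(elo_rating_median, key=lambda k: -elo_rating_median[k])

limit_show_number = 2  # the commit uses 25 to keep the plots readable
model_order = model_order[:limit_show_number]

# Any per-model table (win fractions, bootstrap quantiles, ...) is then
# restricted to the kept models before plotting.
ratings = pd.Series(elo_rating_median)
print(ratings.loc[model_order])  # only model-a and model-b remain
```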
4 changes: 2 additions & 2 deletions fastchat/serve/monitor/monitor.py
@@ -35,11 +35,11 @@ def make_leaderboard_md(elo_results):
| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
🏆 This leaderboard is based on the following three benchmarks.
- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 70K+ user votes to compute Elo ratings.
- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 90K+ user votes to compute Elo ratings.
- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023.
💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: October, 2023.
"""
return leaderboard_md

