From a37115eb726afb726142ac8a7b63033c24fd756f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?= Date: Sat, 19 Oct 2024 01:36:44 -0700 Subject: [PATCH 1/6] added a style control column for arena hard --- fastchat/serve/monitor/monitor.py | 139 ++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 28 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index e34d046a6..dc4cedc89 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -25,7 +25,8 @@ from fastchat.utils import build_logger, get_window_url_params_js -from fastchat.serve.monitor.monitor_md import ( +# from fastchat.serve.monitor.monitor_md import ( +from monitor_md import ( cat_name_to_baseline, key_to_category_name, cat_name_to_explanation, @@ -249,12 +250,38 @@ def get_full_table(arena_df, model_table_df, model_to_score): def arena_hard_process(leaderboard_table_file, filepath): - arena_hard = pd.read_csv(filepath) + with open(filepath, "rb") as f: # 'rb' is for reading in binary mode + combined_arena_hard = pickle.load(f) # normal + styled results + + arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal + arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style + + + normal_rankings = recompute_final_ranking(arena_hard_normal) + style_rankings = recompute_final_ranking(arena_hard_style) + + + arena_hard_normal.insert(loc=0, column="Rank* (UB)", value=normal_rankings) # normal rankings + arena_hard_style.insert(loc=0, column="Rank (StyleCtrl)", value=style_rankings) # style rankings + + arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int) + + # combine together + combined_df = pd.merge(arena_hard_normal, arena_hard_style[['model', 'Rank (StyleCtrl)']], + on='model', how='left') + + # Move 'Rank (StyleCtrl)' to position 1 (second column) + columns = list(combined_df.columns) + columns.insert(1, columns.pop(columns.index('Rank (StyleCtrl)'))) + combined_df = combined_df[columns] + + leaderboard_table = pd.read_csv(leaderboard_table_file) links = leaderboard_table.get("Link") display_name = leaderboard_table.get("Model") model_name = leaderboard_table.get("key") organization = leaderboard_table.get("Organization") + info = {} for i in range(len(model_name)): @@ -265,23 +292,21 @@ def arena_hard_process(leaderboard_table_file, filepath): info[model_name[i]] = model_info organization = [] - for i in range(len(arena_hard)): + for i in range(len(combined_df)): assert ( - arena_hard.loc[i, "model"] in info - ), f"need to update leaderboard_table info by adding {arena_hard.loc[i, 'model']}" - organization.append(info[arena_hard.loc[i, "model"]]["org"]) - link = info[arena_hard.loc[i, "model"]]["link"] - arena_hard.loc[i, "model"] = model_hyperlink( - info[arena_hard.loc[i, "model"]]["display"], link + combined_df.loc[i, "model"] in info + ), f"need to update leaderboard_table info by adding {combined_df.loc[i, 'model']}" + organization.append(info[combined_df.loc[i, "model"]]["org"]) + link = info[combined_df.loc[i, "model"]]["link"] + combined_df.loc[i, "model"] = model_hyperlink( + info[combined_df.loc[i, "model"]]["display"], link ) - arena_hard.insert( - loc=len(arena_hard.columns), column="Organization", value=organization + combined_df.insert( + loc=len(combined_df.columns), column="Organization", value=organization ) - rankings = recompute_final_ranking(arena_hard) - arena_hard.insert(loc=0, column="Rank* (UB)", value=rankings) - return arena_hard + 
return combined_df def get_arena_table( @@ -440,6 +465,49 @@ def compare_func(row): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) + +def update_hard_leaderboard_df(arena_table_vals): + columns = [ + "Rank* (UB)", + "Rank (StyleCtrl)", + "Model", + "Win-rate", + "95% CI", + "Average Tokens", + "Organization" + ] + dataframe = pd.DataFrame(arena_table_vals, columns=columns) + + + def highlight_red(s): + return [("color: red; font-weight: bold") for v in s] + + def highlight_green(s): + return [("color: green; font-weight: bold") for v in s] + + def compare_func(row): + if row["Rank (StyleCtrl)"] is None: + return 0 + if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]: + return 0 + elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: + return 1 + else: + return -1 + + comparison = dataframe.apply( + compare_func, + axis=1, + ) + indices_red = [i for i, value in enumerate(comparison) if value == -1] + indices_green = [i for i, value in enumerate(comparison) if value == 1] + + return dataframe.style.apply( + highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]] + ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) + + + def build_arena_tab( elo_results, model_table_df, @@ -453,7 +521,7 @@ def build_arena_tab( """, ) return - + arena_dfs = {} category_elo_results = {} last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] @@ -730,6 +798,7 @@ def update_leaderboard_and_plots(category, filters): return [plot_1, plot_2, plot_3, plot_4] + def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): arena_df = elo_results["full"]["leaderboard_table_df"] md = make_full_leaderboard_md() @@ -1013,12 +1082,15 @@ def build_leaderboard_tab( dataFrame = arena_hard_process( leaderboard_table_file, arena_hard_leaderboard ) + + date = dataFrame["date"][0] dataFrame = dataFrame.drop( columns=["rating_q025", "rating_q975", "date"] ) dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval) - dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/-{x[0]}") + dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}") + dataFrame = dataFrame.rename( columns={ "model": "Model", @@ -1027,24 +1099,35 @@ def build_leaderboard_tab( "avg_tokens": "Average Tokens", } ) + dataFrame['Win-rate'] = dataFrame['Win-rate'].apply(lambda x: f'{x:g}' if pd.notnull(x) else x) + model_to_score = {} for i in range(len(dataFrame)): - model_to_score[dataFrame.loc[i, "Model"]] = dataFrame.loc[ - i, "Win-rate" - ] + model_name = dataFrame.loc[i, "Model"] + win_rate = dataFrame.loc[i, "Win-rate"] + + model_to_score[model_name] = win_rate + md = arena_hard_title(date) gr.Markdown(md, elem_id="leaderboard_markdown") + + dataFrame = update_hard_leaderboard_df(dataFrame) gr.DataFrame( dataFrame, - datatype=[ - "markdown" if col == "Model" else "str" - for col in dataFrame.columns + datatype=[ + "number", + "number", + "markdown", + "str", + "str", + "number", + "str" ], elem_id="arena_hard_leaderboard", - height=1000, + height = 1000, wrap=True, - column_widths=[70, 190, 80, 80, 90, 150], - ) + column_widths=[70, 70, 190, 80, 80, 90, 150], + ) with gr.Tab("Full Leaderboard", id=4): build_full_leaderboard_tab( @@ -1099,10 +1182,10 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard) button_small_text_size="20px", button_large_text_weight="100", button_small_text_weight="100", - button_shadow="*shadow_drop_lg", - button_shadow_hover="*shadow_drop_lg", + # button_shadow="*shadow_drop_lg", + # 
button_shadow_hover="*shadow_drop_lg", checkbox_label_shadow="*shadow_drop_lg", - button_shadow_active="*shadow_inset", + # button_shadow_active="*shadow_inset", button_secondary_background_fill="*primary_300", button_secondary_background_fill_dark="*primary_700", button_secondary_background_fill_hover="*primary_200", From b689113bb6551ae4a422a685461f3d65c8dc611f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?= Date: Sat, 19 Oct 2024 14:46:47 -0700 Subject: [PATCH 2/6] changed the formatting --- fastchat/serve/monitor/monitor.py | 79 ++++++++++++++++--------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index dc4cedc89..e66c8e7e7 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -251,37 +251,45 @@ def get_full_table(arena_df, model_table_df, model_to_score): def arena_hard_process(leaderboard_table_file, filepath): with open(filepath, "rb") as f: # 'rb' is for reading in binary mode - combined_arena_hard = pickle.load(f) # normal + styled results - - arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal - arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style - - + combined_arena_hard = pickle.load(f) # normal + styled results + + arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal + arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style + normal_rankings = recompute_final_ranking(arena_hard_normal) - style_rankings = recompute_final_ranking(arena_hard_style) - - - arena_hard_normal.insert(loc=0, column="Rank* (UB)", value=normal_rankings) # normal rankings - arena_hard_style.insert(loc=0, column="Rank (StyleCtrl)", value=style_rankings) # style rankings - + style_rankings = recompute_final_ranking(arena_hard_style) + + arena_hard_normal.insert( + loc=0, column="Rank* (UB)", value=normal_rankings + ) # normal rankings + arena_hard_style.insert( + loc=0, column="Rank (StyleCtrl)", value=style_rankings + ) # style rankings + arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int) - + # combine together - combined_df = pd.merge(arena_hard_normal, arena_hard_style[['model', 'Rank (StyleCtrl)']], - on='model', how='left') + combined_df = pd.merge( + arena_hard_normal, + arena_hard_style[["model", "Rank (StyleCtrl)"]], + on="model", + how="left", + ) # Move 'Rank (StyleCtrl)' to position 1 (second column) columns = list(combined_df.columns) - columns.insert(1, columns.pop(columns.index('Rank (StyleCtrl)'))) + columns.insert(1, columns.pop(columns.index("Rank (StyleCtrl)"))) combined_df = combined_df[columns] - - + + combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype( + int + ) # convert the ranking to integer values + leaderboard_table = pd.read_csv(leaderboard_table_file) links = leaderboard_table.get("Link") display_name = leaderboard_table.get("Model") model_name = leaderboard_table.get("key") organization = leaderboard_table.get("Organization") - info = {} for i in range(len(model_name)): @@ -465,7 +473,6 @@ def compare_func(row): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) - def update_hard_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -474,11 +481,10 @@ def update_hard_leaderboard_df(arena_table_vals): "Win-rate", "95% CI", "Average Tokens", - "Organization" + "Organization", ] dataframe = pd.DataFrame(arena_table_vals, columns=columns) - def 
highlight_red(s): return [("color: red; font-weight: bold") for v in s] @@ -507,7 +513,6 @@ def compare_func(row): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) - def build_arena_tab( elo_results, model_table_df, @@ -521,7 +526,7 @@ def build_arena_tab( """, ) return - + arena_dfs = {} category_elo_results = {} last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] @@ -798,7 +803,6 @@ def update_leaderboard_and_plots(category, filters): return [plot_1, plot_2, plot_3, plot_4] - def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): arena_df = elo_results["full"]["leaderboard_table_df"] md = make_full_leaderboard_md() @@ -1082,15 +1086,14 @@ def build_leaderboard_tab( dataFrame = arena_hard_process( leaderboard_table_file, arena_hard_leaderboard ) - - + date = dataFrame["date"][0] dataFrame = dataFrame.drop( columns=["rating_q025", "rating_q975", "date"] ) dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval) dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}") - + dataFrame = dataFrame.rename( columns={ "model": "Model", @@ -1099,35 +1102,37 @@ def build_leaderboard_tab( "avg_tokens": "Average Tokens", } ) - dataFrame['Win-rate'] = dataFrame['Win-rate'].apply(lambda x: f'{x:g}' if pd.notnull(x) else x) - + dataFrame["Win-rate"] = dataFrame["Win-rate"].apply( + lambda x: f"{x:g}" if pd.notnull(x) else x + ) + model_to_score = {} for i in range(len(dataFrame)): model_name = dataFrame.loc[i, "Model"] win_rate = dataFrame.loc[i, "Win-rate"] - + model_to_score[model_name] = win_rate - + md = arena_hard_title(date) gr.Markdown(md, elem_id="leaderboard_markdown") - + dataFrame = update_hard_leaderboard_df(dataFrame) gr.DataFrame( dataFrame, - datatype=[ + datatype=[ "number", "number", "markdown", "str", "str", "number", - "str" + "str", ], elem_id="arena_hard_leaderboard", - height = 1000, + height=1000, wrap=True, column_widths=[70, 70, 190, 80, 80, 90, 150], - ) + ) with gr.Tab("Full Leaderboard", id=4): build_full_leaderboard_tab( From 4d6c8818649dda3b279b92c1c212cd43b929926a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?= Date: Sun, 20 Oct 2024 12:37:44 -0700 Subject: [PATCH 3/6] updated the monitor file --- fastchat/serve/monitor/monitor.py | 147 ++++++++++++++---------------- 1 file changed, 69 insertions(+), 78 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index e66c8e7e7..2a1f98e48 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -25,8 +25,7 @@ from fastchat.utils import build_logger, get_window_url_params_js -# from fastchat.serve.monitor.monitor_md import ( -from monitor_md import ( +from fastchat.serve.monitor.monitor_md import ( cat_name_to_baseline, key_to_category_name, cat_name_to_explanation, @@ -251,45 +250,38 @@ def get_full_table(arena_df, model_table_df, model_to_score): def arena_hard_process(leaderboard_table_file, filepath): with open(filepath, "rb") as f: # 'rb' is for reading in binary mode - combined_arena_hard = pickle.load(f) # normal + styled results - - arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal - arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style - + combined_arena_hard = pickle.load(f) # normal + styled results + + arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal + arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style + + normal_rankings = 
recompute_final_ranking(arena_hard_normal) - style_rankings = recompute_final_ranking(arena_hard_style) - - arena_hard_normal.insert( - loc=0, column="Rank* (UB)", value=normal_rankings - ) # normal rankings - arena_hard_style.insert( - loc=0, column="Rank (StyleCtrl)", value=style_rankings - ) # style rankings - + style_rankings = recompute_final_ranking(arena_hard_style) + + + arena_hard_normal.insert(loc=0, column="Rank* (UB)", value=normal_rankings) # normal rankings + arena_hard_style.insert(loc=0, column="Rank (StyleCtrl)", value=style_rankings) # style rankings + arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int) - + # combine together - combined_df = pd.merge( - arena_hard_normal, - arena_hard_style[["model", "Rank (StyleCtrl)"]], - on="model", - how="left", - ) + combined_df = pd.merge(arena_hard_normal, arena_hard_style[['model', 'Rank (StyleCtrl)']], + on='model', how='left') # Move 'Rank (StyleCtrl)' to position 1 (second column) columns = list(combined_df.columns) - columns.insert(1, columns.pop(columns.index("Rank (StyleCtrl)"))) + columns.insert(1, columns.pop(columns.index('Rank (StyleCtrl)'))) combined_df = combined_df[columns] - - combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype( - int - ) # convert the ranking to integer values - + + combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype(int) # convert the ranking to integer values + leaderboard_table = pd.read_csv(leaderboard_table_file) links = leaderboard_table.get("Link") display_name = leaderboard_table.get("Model") model_name = leaderboard_table.get("key") organization = leaderboard_table.get("Organization") + info = {} for i in range(len(model_name)): @@ -302,7 +294,7 @@ def arena_hard_process(leaderboard_table_file, filepath): organization = [] for i in range(len(combined_df)): assert ( - combined_df.loc[i, "model"] in info + combined_df.loc[i, "model"] in info ), f"need to update leaderboard_table info by adding {combined_df.loc[i, 'model']}" organization.append(info[combined_df.loc[i, "model"]]["org"]) link = info[combined_df.loc[i, "model"]]["link"] @@ -430,7 +422,22 @@ def highlight_rank_max(s): return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"]) +def highlight_red(s): + return [("color: red; font-weight: bold") for v in s] +def highlight_green(s): + return [("color: green; font-weight: bold") for v in s] + +def compare_func(row): + if row["Rank (StyleCtrl)"] is None: + return 0 + if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]: + return 0 + elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: + return 1 + else: + return -1 + def update_overall_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -445,22 +452,6 @@ def update_overall_leaderboard_df(arena_table_vals): ] elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns) - def highlight_red(s): - return [("color: red; font-weight: bold") for v in s] - - def highlight_green(s): - return [("color: green; font-weight: bold") for v in s] - - def compare_func(row): - if row["Rank (StyleCtrl)"] is None: - return 0 - if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]: - return 0 - elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: - return 1 - else: - return -1 - comparison = elo_dataframe.apply( compare_func, axis=1, @@ -473,6 +464,7 @@ def compare_func(row): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) + def update_hard_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -481,25 +473,10 @@ def 
update_hard_leaderboard_df(arena_table_vals): "Win-rate", "95% CI", "Average Tokens", - "Organization", + "Organization" ] dataframe = pd.DataFrame(arena_table_vals, columns=columns) - def highlight_red(s): - return [("color: red; font-weight: bold") for v in s] - - def highlight_green(s): - return [("color: green; font-weight: bold") for v in s] - - def compare_func(row): - if row["Rank (StyleCtrl)"] is None: - return 0 - if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]: - return 0 - elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: - return 1 - else: - return -1 comparison = dataframe.apply( compare_func, @@ -513,6 +490,7 @@ def compare_func(row): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) + def build_arena_tab( elo_results, model_table_df, @@ -526,7 +504,7 @@ def build_arena_tab( """, ) return - + arena_dfs = {} category_elo_results = {} last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] @@ -803,6 +781,7 @@ def update_leaderboard_and_plots(category, filters): return [plot_1, plot_2, plot_3, plot_4] + def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): arena_df = elo_results["full"]["leaderboard_table_df"] md = make_full_leaderboard_md() @@ -1086,14 +1065,15 @@ def build_leaderboard_tab( dataFrame = arena_hard_process( leaderboard_table_file, arena_hard_leaderboard ) - + + date = dataFrame["date"][0] dataFrame = dataFrame.drop( columns=["rating_q025", "rating_q975", "date"] ) dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval) dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}") - + dataFrame = dataFrame.rename( columns={ "model": "Model", @@ -1102,37 +1082,48 @@ def build_leaderboard_tab( "avg_tokens": "Average Tokens", } ) - dataFrame["Win-rate"] = dataFrame["Win-rate"].apply( - lambda x: f"{x:g}" if pd.notnull(x) else x - ) - + dataFrame['Win-rate'] = dataFrame['Win-rate'].apply(lambda x: f'{x:g}' if pd.notnull(x) else x) + model_to_score = {} for i in range(len(dataFrame)): model_name = dataFrame.loc[i, "Model"] win_rate = dataFrame.loc[i, "Win-rate"] - + model_to_score[model_name] = win_rate - + md = arena_hard_title(date) gr.Markdown(md, elem_id="leaderboard_markdown") - + dataFrame = update_hard_leaderboard_df(dataFrame) gr.DataFrame( dataFrame, - datatype=[ + datatype=[ "number", "number", "markdown", "str", "str", "number", - "str", + "str" ], elem_id="arena_hard_leaderboard", - height=1000, + height = 1000, wrap=True, column_widths=[70, 70, 190, 80, 80, 90, 150], ) + gr.Markdown( + f""" +***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. +Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). +See Figure 1 below for visualization of the confidence intervals of model scores. + +**Rank (StyleCtrl)**: model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potential confounding variables. +See [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details. + +Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large. 
+""", + elem_id="leaderboard_markdown", + ) with gr.Tab("Full Leaderboard", id=4): build_full_leaderboard_tab( @@ -1187,10 +1178,10 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard) button_small_text_size="20px", button_large_text_weight="100", button_small_text_weight="100", - # button_shadow="*shadow_drop_lg", - # button_shadow_hover="*shadow_drop_lg", + button_shadow="*shadow_drop_lg", + button_shadow_hover="*shadow_drop_lg", checkbox_label_shadow="*shadow_drop_lg", - # button_shadow_active="*shadow_inset", + button_shadow_active="*shadow_inset", button_secondary_background_fill="*primary_300", button_secondary_background_fill_dark="*primary_700", button_secondary_background_fill_hover="*primary_200", From 2c20754d27480792fd62c104e7886be920cd2629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?= Date: Sun, 20 Oct 2024 12:40:08 -0700 Subject: [PATCH 4/6] reformatted the styling of monitor --- fastchat/serve/monitor/monitor.py | 102 ++++++++++++++++-------------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 2a1f98e48..3c5c8490e 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -250,38 +250,45 @@ def get_full_table(arena_df, model_table_df, model_to_score): def arena_hard_process(leaderboard_table_file, filepath): with open(filepath, "rb") as f: # 'rb' is for reading in binary mode - combined_arena_hard = pickle.load(f) # normal + styled results - - arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal - arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style - - + combined_arena_hard = pickle.load(f) # normal + styled results + + arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"]) # normal + arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"]) # style + normal_rankings = recompute_final_ranking(arena_hard_normal) - style_rankings = recompute_final_ranking(arena_hard_style) - - - arena_hard_normal.insert(loc=0, column="Rank* (UB)", value=normal_rankings) # normal rankings - arena_hard_style.insert(loc=0, column="Rank (StyleCtrl)", value=style_rankings) # style rankings - + style_rankings = recompute_final_ranking(arena_hard_style) + + arena_hard_normal.insert( + loc=0, column="Rank* (UB)", value=normal_rankings + ) # normal rankings + arena_hard_style.insert( + loc=0, column="Rank (StyleCtrl)", value=style_rankings + ) # style rankings + arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int) - + # combine together - combined_df = pd.merge(arena_hard_normal, arena_hard_style[['model', 'Rank (StyleCtrl)']], - on='model', how='left') + combined_df = pd.merge( + arena_hard_normal, + arena_hard_style[["model", "Rank (StyleCtrl)"]], + on="model", + how="left", + ) # Move 'Rank (StyleCtrl)' to position 1 (second column) columns = list(combined_df.columns) - columns.insert(1, columns.pop(columns.index('Rank (StyleCtrl)'))) + columns.insert(1, columns.pop(columns.index("Rank (StyleCtrl)"))) combined_df = combined_df[columns] - - combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype(int) # convert the ranking to integer values - + + combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype( + int + ) # convert the ranking to integer values + leaderboard_table = pd.read_csv(leaderboard_table_file) links = leaderboard_table.get("Link") display_name = leaderboard_table.get("Model") 
model_name = leaderboard_table.get("key") organization = leaderboard_table.get("Organization") - info = {} for i in range(len(model_name)): @@ -294,7 +301,7 @@ def arena_hard_process(leaderboard_table_file, filepath): organization = [] for i in range(len(combined_df)): assert ( - combined_df.loc[i, "model"] in info + combined_df.loc[i, "model"] in info ), f"need to update leaderboard_table info by adding {combined_df.loc[i, 'model']}" organization.append(info[combined_df.loc[i, "model"]]["org"]) link = info[combined_df.loc[i, "model"]]["link"] @@ -413,21 +420,22 @@ def highlight_rank_max(s): ( "color: green; font-weight: bold" if v > 0 - else "color: red; font-weight: bold" - if v < 0 - else "" + else "color: red; font-weight: bold" if v < 0 else "" ) for v in s ] return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"]) + def highlight_red(s): return [("color: red; font-weight: bold") for v in s] + def highlight_green(s): return [("color: green; font-weight: bold") for v in s] + def compare_func(row): if row["Rank (StyleCtrl)"] is None: return 0 @@ -437,7 +445,8 @@ def compare_func(row): return 1 else: return -1 - + + def update_overall_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -464,7 +473,6 @@ def update_overall_leaderboard_df(arena_table_vals): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) - def update_hard_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -473,11 +481,10 @@ def update_hard_leaderboard_df(arena_table_vals): "Win-rate", "95% CI", "Average Tokens", - "Organization" + "Organization", ] dataframe = pd.DataFrame(arena_table_vals, columns=columns) - comparison = dataframe.apply( compare_func, axis=1, @@ -490,7 +497,6 @@ def update_hard_leaderboard_df(arena_table_vals): ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) - def build_arena_tab( elo_results, model_table_df, @@ -504,7 +510,7 @@ def build_arena_tab( """, ) return - + arena_dfs = {} category_elo_results = {} last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] @@ -544,9 +550,9 @@ def update_leaderboard_and_plots(category, filters): arena_values = get_arena_table( arena_df, model_table_df, - arena_subset_df=arena_subset_df - if category != "Overall" - else arena_overall_sc_df, + arena_subset_df=( + arena_subset_df if category != "Overall" else arena_overall_sc_df + ), hidden_models=( None if len(filters) > 0 and "Show Deprecated" in filters @@ -781,7 +787,6 @@ def update_leaderboard_and_plots(category, filters): return [plot_1, plot_2, plot_3, plot_4] - def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): arena_df = elo_results["full"]["leaderboard_table_df"] md = make_full_leaderboard_md() @@ -1065,15 +1070,14 @@ def build_leaderboard_tab( dataFrame = arena_hard_process( leaderboard_table_file, arena_hard_leaderboard ) - - + date = dataFrame["date"][0] dataFrame = dataFrame.drop( columns=["rating_q025", "rating_q975", "date"] ) dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval) dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}") - + dataFrame = dataFrame.rename( columns={ "model": "Model", @@ -1082,37 +1086,39 @@ def build_leaderboard_tab( "avg_tokens": "Average Tokens", } ) - dataFrame['Win-rate'] = dataFrame['Win-rate'].apply(lambda x: f'{x:g}' if pd.notnull(x) else x) - + dataFrame["Win-rate"] = dataFrame["Win-rate"].apply( + lambda x: f"{x:g}" if pd.notnull(x) else x + ) + model_to_score = {} for i in range(len(dataFrame)): 
model_name = dataFrame.loc[i, "Model"]
                     win_rate = dataFrame.loc[i, "Win-rate"]
-    
+
                     model_to_score[model_name] = win_rate
-    
+
                 md = arena_hard_title(date)
                 gr.Markdown(md, elem_id="leaderboard_markdown")
-                
+
                 dataFrame = update_hard_leaderboard_df(dataFrame)
                 gr.DataFrame(
                     dataFrame,
-                    datatype=[ 
+                    datatype=[
                         "number",
                         "number",
                         "markdown",
                         "str",
                         "str",
                         "number",
-                        "str"
+                        "str",
                     ],
                     elem_id="arena_hard_leaderboard",
-                    height = 1000,
+                    height=1000,
                     wrap=True,
                     column_widths=[70, 70, 190, 80, 80, 90, 150],
                 )
                 gr.Markdown(
-                    f""" 
+                    f"""
 ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
 Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
 See Figure 1 below for visualization of the confidence intervals of model scores.
 
 **Rank (StyleCtrl)**: model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potential confounding variables.
 See [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details.
 
 Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large.
 """,
-                elem_id="leaderboard_markdown",
-            )
+                    elem_id="leaderboard_markdown",
+                )
 
         with gr.Tab("Full Leaderboard", id=4):
             build_full_leaderboard_tab(

From 75409868ff3ea631a2a26b6ff85d4c6f6d516d48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?=
Date: Sun, 20 Oct 2024 12:53:34 -0700
Subject: [PATCH 5/6] added comments to the style-control highlight helpers

---
 fastchat/serve/monitor/monitor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 3c5c8490e..ff1b6b960 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -428,14 +428,17 @@ def highlight_rank_max(s):
     return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"])
 
 
+# highlight the style control rank value as red
 def highlight_red(s):
     return [("color: red; font-weight: bold") for v in s]
 
 
+# highlight the style control rank value as green
 def highlight_green(s):
     return [("color: green; font-weight: bold") for v in s]
 
 
+# decide whether to highlight the style control rank value as green or red
 def compare_func(row):
     if row["Rank (StyleCtrl)"] is None:
         return 0

From 93f46004e3c2533fe8c950356a8d2e6574b2bc31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Cyixin-huang1=E2=80=9D?=
Date: Sun, 20 Oct 2024 13:00:53 -0700
Subject: [PATCH 6/6] expanded the nested conditional in highlight_rank_max

---
 fastchat/serve/monitor/monitor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index ff1b6b960..39ec3cb35 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -420,7 +420,9 @@ def highlight_rank_max(s):
         (
             "color: green; font-weight: bold"
             if v > 0
-            else "color: red; font-weight: bold" if v < 0 else ""
+            else "color: red; font-weight: bold"
+            if v < 0
+            else ""
         )
         for v in s
     ]
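
For reference: this series switches arena_hard_process() from reading a CSV to reading a pickled pair of result tables. Below is a minimal sketch of the file layout the patched code appears to expect, inferred only from the fields the diffs touch (the "normal" and "style_control" keys plus the model, score, CI, avg_tokens, rating_q025, rating_q975, and date columns). The values are placeholders, the output filename is hypothetical, and it assumes recompute_final_ranking() works off the rating quantile columns; the real producer of this file is not part of the series.

import pickle

import pandas as pd

# Placeholder rows; column names mirror what the patched monitor.py reads.
table = pd.DataFrame(
    {
        "model": ["model-a", "model-b"],       # merge key; later replaced by a hyperlink
        "score": [79.2, 77.5],                 # renamed to "Win-rate"
        "CI": ["(1.8, 2.2)", "(1.9, 2.4)"],    # parsed with ast.literal_eval, shown as "95% CI"
        "avg_tokens": [576.0, 541.0],          # cast to int in the "normal" table
        "rating_q025": [77.4, 75.6],           # dropped once rankings are computed
        "rating_q975": [81.4, 79.9],           # dropped once rankings are computed
        "date": ["2024-10-20", "2024-10-20"],  # first value feeds arena_hard_title()
    }
)

# Same benchmark scored twice: once normally, once under style control.
combined = {"normal": table, "style_control": table.copy()}

with open("arena_hard_leaderboard.pkl", "wb") as f:
    pickle.dump(combined, f)

Carrying both tables in one file is what lets the tab render "Rank* (UB)" and "Rank (StyleCtrl)" side by side after the left merge on "model".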