chat_eval.py
"""Implements three evaluation metrics for dialogue quality: perplexity, distinct-n and LLM-as-a-judge."""
import os
import sys
import re
import json
import string
import logging

import numpy as np
import torch
import transformers
import evaluate
from nltk.util import ngrams
from tqdm import tqdm
from transformers import GPT2Tokenizer  # gpt2 tokenizer (not used directly below)

from llm_engines import ChatgptLLM, LLMApi
from dialogue_react_agent import load_base_prompt

# Set up logging
logging.basicConfig(filename="evaluation.log", level=logging.INFO, filemode="w",
                    format="%(asctime)-15s %(message)s", force=True)

# Jinja templates for the absolute and pairwise judge prompts
absolute_eval_prompt_path = "prompts/absolute_eval.j2"
absolute_eval_prompt = load_base_prompt(absolute_eval_prompt_path)
pairwise_eval_prompt_path = "prompts/pairwise_eval.j2"
pairwise_eval_prompt = load_base_prompt(pairwise_eval_prompt_path)
def load_chat_history(path: str):
    """Load a chat history stored as a single-line JSON file.

    The file contains a list of messages, each an array of [turn, speaker, text].
    The history is flattened into a "Speaker: message\\n" string, one line per message.
    """
    with open(path, "r") as f:
        chat_history = json.loads(f.readline())
    # every message is [turn, speaker, text]; convert the list into a single string
    chat_str = ""
    for message in chat_history:
        chat_str += message[1].strip() + ": " + message[2].strip() + "\n"
    return chat_str
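

# Hedged usage sketch for load_chat_history, assuming the single-line JSON layout
# described in the docstring above; the file name "example_chat.json" and the
# messages are hypothetical.
def _example_load_chat_history():
    messages = [[0, "Alice", "Hi"], [1, "Bob", "Hello"]]
    with open("example_chat.json", "w") as f:
        f.write(json.dumps(messages))
    # expected return value: "Alice: Hi\nBob: Hello\n"
    return load_chat_history("example_chat.json")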
def calc_perplexity(chat_history: str | list[str]):
    """Use the HF evaluate perplexity metric (gpt2) to score each chat history.

    Returns a list of perplexity scores, one per chat.
    """
    if isinstance(chat_history, str):
        chat_history = [chat_history]
    # split the chat histories into batches
    batch_size = 8
    chat_batches = [chat_history[i:i + batch_size] for i in range(0, len(chat_history), batch_size)]
    perplexities = []
    # process each batch, reloading the metric so GPU memory can be freed in between
    for batch in chat_batches:
        perplexity = evaluate.load("perplexity", module_type="metric")
        batch_results = perplexity.compute(model_id="gpt2",
                                           add_start_token=False,
                                           predictions=batch,
                                           max_length=1024,
                                           device="cuda")
        # clean up GPU memory
        torch.cuda.empty_cache()
        del perplexity
        perplexities.extend(batch_results["perplexities"])
    return perplexities
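

# Hedged usage sketch for calc_perplexity: it downloads gpt2 via the HF `evaluate`
# perplexity metric and expects a CUDA device, so this only shows the call shape;
# the two chat strings are hypothetical.
def _example_calc_perplexity():
    chats = [
        "Alice: Hi\nBob: Hello\n",
        "Alice: How are you?\nBob: Fine, thanks\n",
    ]
    scores = calc_perplexity(chats)  # one perplexity score per chat history
    return dict(zip(chats, scores))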
def calc_distinct_n(chat_history: str | list[str], n: int = 1):
    """Calculate distinct-n: the number of distinct n-grams normalised by chat length.

    Returns a list of distinct-n scores, one per chat.
    """
    if isinstance(chat_history, str):
        chat_history = [chat_history]
    n_distincts = []
    for chat in chat_history:
        # normalise the text: replace newlines with spaces and lowercase everything
        chat = chat.replace("\n", " ").lower()
        # count the distinct n-grams of size n
        n_distinct = len(set(ngrams(chat.split(), n)))
        # normalise by the number of tokens so longer chats are not favoured
        n_distinct = n_distinct / len(chat.split())
        n_distincts.append(n_distinct)
    return n_distincts
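

# Hedged worked example for calc_distinct_n: "The cat sat on the mat" lowercases to
# six tokens with five distinct unigrams ("the" repeats), so distinct-1 is 5/6.
# The sentence is hypothetical.
def _example_calc_distinct_n():
    return calc_distinct_n("The cat sat on the mat", n=1)  # [0.8333...]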
# LLM-as-a-judge: absolute evaluation
def calc_llm_as_a_judge(chat_history: str | list[str], model: str = "gpt-4o", n_consistency: int = 1):
    """Calculate the LLM-as-a-judge metric (absolute evaluation).

    Returns a list of scores, one per chat, each averaged over n_consistency judgements.
    """
    # format chat_history as a list of strings if it is a single string
    if isinstance(chat_history, str):
        chat_history = [chat_history]
    if model == "prometheus-2.0":
        # use the local Prometheus judge model
        model = "prometheus-7b-v2.0.Q5_0.gguf"
        llm = LLMApi()
        llm.model = model
    else:
        # default to an OpenAI judge model
        llm = ChatgptLLM()
        llm.model = model
    # the prompt is a jinja template
    prompt = absolute_eval_prompt
    scores = []

    def extract_score(llm_answer: str):
        # the score is the number after the final [RESULT] token
        # it should be a number between 0 and 5
        result_token = "[RESULT]"
        # find the position of the last result_token
        result_pos = llm_answer.rfind(result_token)
        if result_pos == -1:
            logging.info(f"No result token found in the answer: {llm_answer}")
            raise ValueError(f"No result token found in the answer: {llm_answer}")
        # keep only the substring after the result_token
        llm_answer = llm_answer[result_pos + len(result_token):]
        # the score should be the first non-whitespace token
        match = re.match(r"(\s*)(\S+)", llm_answer)
        if match:
            score = float(match.group(2))
            if score < 0 or score > 5:
                logging.info(f"Invalid score: {score}")
                print(llm_answer)
                raise ValueError(f"Invalid score: {score}")
            return score
        else:
            logging.info(f"Invalid answer: {llm_answer}")
            print(llm_answer)
            raise ValueError(f"Invalid answer: {llm_answer}")

    logging.info(f"Judging singularly {len(chat_history)} chat histories with model {llm.model}")
    for chat in tqdm(chat_history, desc="Absolute evaluation"):
        single_chat_scores = []
        prompt_rendered = prompt.render(chat_history=chat)
        logging.info(f"Prompt rendered: {prompt_rendered}")
        logging.info(f"Generating {n_consistency} scores for this chat history.")
        while len(single_chat_scores) < n_consistency:
            score = None
            # retry until the judge returns a parsable score (note: 0.0 is a valid score)
            while score is None:
                try:
                    score = extract_score(llm.generate_response(prompt_rendered))
                except Exception:
                    pass
            single_chat_scores.append(score)
            logging.info(f"Score generated: {score}")
        logging.info(f"Overall scores for this chat history: {np.mean(single_chat_scores)}")
        scores.append(np.mean(single_chat_scores))
    if len(chat_history) > 1:
        logging.info(f"Mean score: {np.mean(scores)}")
    else:
        logging.info(f"Score: {scores[0]}")
    return scores
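

# Hedged sketch of the answer format extract_score expects from the judge prompt:
# free-form feedback followed by "[RESULT] <score between 0 and 5>". The feedback
# sentence below is hypothetical; only the "[RESULT] 4" tail drives the parsed score.
def _example_judge_answer_format():
    example_answer = "The dialogue is coherent and the speakers stay on topic. [RESULT] 4"
    result_pos = example_answer.rfind("[RESULT]")
    return float(example_answer[result_pos + len("[RESULT]"):].split()[0])  # 4.0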
# same metric but pairwise: for each pair of chat histories, pick a winner
def calc_llm_as_a_judge_pairwise(chat_history_a: str | list[str], chat_history_b: str | list[str], model: str = "gpt-4o", n_consistency: int = 1):
    """Calculate the pairwise LLM-as-a-judge metric.

    For each pair of chat histories, returns the winner ("A" or "B") decided by
    majority vote over n_consistency judgements.
    """
    prompt = pairwise_eval_prompt
    # format the chat histories as lists of strings if they are single strings
    if isinstance(chat_history_a, str):
        chat_history_a = [chat_history_a]
    if isinstance(chat_history_b, str):
        chat_history_b = [chat_history_b]
    # the two groups must be aligned pairwise
    assert len(chat_history_a) == len(chat_history_b), "Chat histories must have the same length"
    if model == "prometheus-2.0":
        # use the local Prometheus judge model
        model = "prometheus-7b-v2.0.Q5_0.gguf"
        llm = LLMApi()
        llm.model = model
    else:
        # default to an OpenAI judge model
        llm = ChatgptLLM()
        llm.model = model

    def extract_score(llm_answer: str):
        """Extract the winner of the comparison (A or B) from the LLM answer.

        Raises ValueError if the answer is not in the expected format.
        """
        # the winner is the letter after the final [RESULT] token; it should be A or B
        result_token = "[RESULT]"
        # find the position of the last result_token
        result_pos = llm_answer.rfind(result_token)
        if result_pos == -1:
            logging.info(f"No result token found in the answer: {llm_answer}")
            raise ValueError(f"No result token found in the answer: {llm_answer}")
        # keep only the substring after the result_token
        llm_answer = llm_answer[result_pos + len(result_token):]
        # the winner should be the first non-whitespace token
        match = re.match(r"(\s*)(\S+)", llm_answer)
        if match:
            score = match.group(2)
            if score not in ["A", "B"]:
                logging.info(f"Invalid score: {score}")
                raise ValueError(f"Invalid score: {score}")
            return score
        else:
            logging.info(f"Invalid answer: {llm_answer}")
            raise ValueError(f"Invalid answer: {llm_answer}")

    logging.info(f"Judging pairwise {len(chat_history_a)} chat histories of group A against {len(chat_history_b)} of group B with model {llm.model}")
    matchup_winners = []
    for chat_a, chat_b in tqdm(zip(chat_history_a, chat_history_b), total=len(chat_history_a), desc="Pairwise evaluation"):
        single_comparison_winners = []
        logging.info(f"Generating {n_consistency} winners for this pair of chat histories.")
        # use majority voting over n_consistency rounds to get the final matchup winner
        while len(single_comparison_winners) < n_consistency:
            # randomly swap the presentation order so neither position is favoured
            switched = np.random.rand() > 0.5
            first, second = (chat_b, chat_a) if switched else (chat_a, chat_b)
            prompt_rendered = prompt.render(chat_history_a=first, chat_history_b=second)
            winner_this_round = None
            # retry until the judge returns a parsable winner
            while winner_this_round is None:
                try:
                    winner_this_round = extract_score(llm.generate_response(prompt_rendered))
                    # map the winner back to the original ordering if the chats were swapped
                    if switched:
                        winner_this_round = "A" if winner_this_round == "B" else "B"
                except Exception:
                    pass
            single_comparison_winners.append(winner_this_round)
            logging.info(f"Winner generated: {winner_this_round}")
        winner_this_matchup = max(set(single_comparison_winners), key=single_comparison_winners.count)
        # log the per-round wins for this pair
        logging.info(f"Consistency level wins for A: {single_comparison_winners.count('A')}")
        logging.info(f"Consistency level wins for B: {single_comparison_winners.count('B')}")
        logging.info(f"Overall winner for this pair of chat histories: {winner_this_matchup}")
        matchup_winners.append(winner_this_matchup)
    # log the matchup wins per group of chat histories
    logging.info(f"Matchup wins for A: {matchup_winners.count('A')}")
    logging.info(f"Matchup wins for B: {matchup_winners.count('B')}")
    # return the list of winners, one per pair
    return matchup_winners
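

# Hedged worked example of the majority vote used above: with n_consistency=3 and
# per-round winners ["A", "B", "A"], max(set(...), key=list.count) returns "A".
# The vote list is hypothetical.
def _example_majority_vote():
    single_comparison_winners = ["A", "B", "A"]
    return max(set(single_comparison_winners), key=single_comparison_winners.count)  # "A"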
if __name__ == "__main__":
    chat_history = [
        "Alice: Hi,\nBob: Hello,\nAlice: How are you?\nBob: I'm fine, thank you.",
        "Alice: Hi,\nBob: Hello,\nAlice: How are you?\nBob: Not bad",
    ]
    judge_score = calc_llm_as_a_judge(chat_history, n_consistency=3, model="prometheus-2.0")
    print(f"LLM-As-A-JUDGE score: {judge_score}")
    # test pairwise
    chat_history_a = [
        "Alice: Hi,\nBob: Hello,\nAlice: How are you?\nBob: I'm fine, thank you.",
        "Alice: Hi,\nBob: Hello,\nAlice: How are you?\nBob: Not bad",
    ]
    chat_history_b = [
        "Alice: Hi,\nBob: Hello,\nAlice: How are you?\nBob: I hate you",
        "Alice: Hi,\nBob: Hello,\nAlice: The pen is on the table\nBob: Not bad",
    ]
    pairwise_matches = calc_llm_as_a_judge_pairwise(chat_history_a, chat_history_b, n_consistency=3, model="prometheus-2.0")
    print(f"Pairwise winners: {pairwise_matches}")