Allow custom answer generation function in WWB (#507)
This change makes it possible to plug in model APIs that use a different interface for generating answers (e.g. OpenVINO GenAI).

Example with OpenVINO GenAI:

```
from transformers import AutoModelForCausalLM, AutoTokenizer
import huggingface_hub as hf_hub
import whowhatbench
import openvino_genai

model_id = "databricks/dolly-v2-3b"
base_model = AutoModelForCausalLM.from_pretrained(model_id)
ov_model_dir = "./dolly-v2-3b-int4-ov"

hf_hub.snapshot_download("OpenVINO/dolly-v2-3b-int4-ov", local_dir=ov_model_dir)
optimized_model = openvino_genai.LLMPipeline(ov_model_dir, "CPU")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The GenAI pipeline takes the prompt string directly, so the tokenizer and
# crop_question arguments from the evaluator's contract are unused here.
def genai_gen_answer(model, tokenizer, question, max_new_tokens, crop_question):
    out = model.generate(question, max_new_tokens=max_new_tokens)
    return out.texts[0]

evaluator = whowhatbench.Evaluator(base_model=base_model, tokenizer=tokenizer)
metrics_per_prompt, metrics = evaluator.score(optimized_model, gen_answer_fn=genai_gen_answer)
```
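
Any callable with the signature `(model, tokenizer, question, max_new_tokens, crop_question)` that returns an answer string can be passed as `gen_answer_fn`. As a further illustration, here is a sketch not taken from the commit (`hf_pipe` and `hf_pipeline_gen_answer` are hypothetical names) wrapping a backend whose output echoes the prompt, which is where `crop_question` matters:

```
from transformers import pipeline

# Hypothetical example: a transformers text-generation pipeline. Its output
# includes the prompt, so crop_question is used to strip it back off.
hf_pipe = pipeline("text-generation", model="databricks/dolly-v2-3b")

def hf_pipeline_gen_answer(model, tokenizer, question, max_new_tokens, crop_question):
    # model is whatever object was passed to score(), here the pipeline itself
    out = model(question, max_new_tokens=max_new_tokens)[0]["generated_text"]
    return out[len(question):] if crop_question else out

metrics_per_prompt, metrics = evaluator.score(hf_pipe, gen_answer_fn=hf_pipeline_gen_answer)
```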
eaidova authored Jun 20, 2024
1 parent c382c6c commit c7c592d
Showing 2 changed files with 13 additions and 9 deletions.
20 changes: 12 additions & 8 deletions llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py
```diff
@@ -78,8 +78,8 @@ def __init__(
     def dump_gt(self, csv_name: str):
         self.gt_data.to_csv(csv_name)
 
-    def score(self, model):
-        predictions = self._generate_data(model)
+    def score(self, model, gen_answer_fn=None):
+        predictions = self._generate_data(model, gen_answer_fn)
 
         all_metrics_per_question = {}
         all_metrics = {}
@@ -119,7 +119,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
         return res
 
-    def _generate_data(self, model):
+    def _generate_data(self, model, gen_answer_fn=None):
+        def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question):
+            inputs = self.tokenizer(question, return_tensors="pt")
+            tokens = model.generate(**inputs, max_new_tokens=max_new_tokens)
+            out = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+            return out[len(question) :] if crop_question else out
+
+        gen_answer_fn = gen_answer_fn or default_gen_answer
+
         if self.test_data:
             if isinstance(self.test_data, str):
                 data = pd.read_csv(self.test_data)
@@ -138,11 +146,7 @@ def _generate_data(self, model):
             answers = []
 
             for q in tqdm(questions.values, desc="Evaluate pipeline"):
-                inputs = self.tokenizer(q, return_tensors="pt")
-                tokens = model.generate(**inputs, max_new_tokens=self.max_new_tokens)
-                out = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
-                answer = out[len(q):] if self._crop_question else out
-                answers.append(answer)
+                answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question))
 
             res_data = {"questions": list(questions.values), "answers": answers}
             df = pd.DataFrame(res_data)
```
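
When `gen_answer_fn` is omitted, `score` falls back to `default_gen_answer`, which reproduces the generation loop previously inlined in `_generate_data`, so existing callers are unaffected. A minimal sketch (`target_model` is a hypothetical Hugging Face model, not from the commit):

```
# Hypothetical target: any HF causal LM compared against the base model.
target_model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b")

# No gen_answer_fn passed: the evaluator tokenizes, generates, decodes, and
# crops the prompt exactly as it did before this change.
metrics_per_prompt, metrics = evaluator.score(target_model)
```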
