From 877177f3bf06634e2688896b7c59d38ccc5e3ff1 Mon Sep 17 00:00:00 2001
From: opentaco
Date: Fri, 25 Aug 2023 18:46:59 +0200
Subject: [PATCH] Check DPO min completion len

---
 openvalidators/reward/dpo.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py
index 3a5860b..6386a80 100644
--- a/openvalidators/reward/dpo.py
+++ b/openvalidators/reward/dpo.py
@@ -32,6 +32,7 @@ def name(self) -> str:
         return RewardModelType.dpo.value
     def __init__(self, device: str):
         super().__init__()
+        self.min_completion_len = 5  # minimum number of tokens expected in each completion part.
         self.device = device
         self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name)
         self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name,
@@ -49,6 +50,12 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float:
             # Tokenize only the prompt, to help determine prompt token length.
             prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device)  # [prompt_len]
 
+            # Number of tokens in the completion part.
+            completion_len = len(combined) - len(prompt_part)
+            # Completion part is less than the minimum allowed length.
+            if completion_len < self.min_completion_len:
+                return -11.  # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size)
+
             # Completion doesn't fit into model sequence, so return lowest reward.
             if self.tokenizer.model_max_length <= len(prompt_part):
                 return -11.  # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size)
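
Note (not part of the patch): a minimal standalone sketch of the guard added above, for sanity-checking the logic and the -11 floor. The helper name short_completion_floor and the example token counts are illustrative assumptions, not code from dpo.py; the threshold of 5 and the -11 value mirror the diff.

    import math
    from typing import Optional

    MIN_COMPLETION_LEN = 5  # mirrors self.min_completion_len set in __init__ by the patch

    def short_completion_floor(combined_len: int, prompt_len: int) -> Optional[float]:
        """Return the floor reward (-11.0) when the completion part has too few tokens, else None."""
        completion_len = combined_len - prompt_len  # tokens contributed by the completion alone
        if completion_len < MIN_COMPLETION_LEN:
            return -11.0
        return None

    # Example token counts (illustrative, not real tokenizer output):
    print(short_completion_floor(combined_len=20, prompt_len=17))  # -11.0 (3-token completion)
    print(short_completion_floor(combined_len=30, prompt_len=17))  # None  (13-token completion)

    # Why -11 works as a floor: exp(-11) ~= 1.67e-5 is below 1/50257 ~= 1.99e-5, the uniform
    # probability over a typical ~50k-token vocabulary, so -11 sits below any achievable
    # mean per-token log-probability the reward model could assign.
    assert math.exp(-11) < 1 / 50257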