From b156a6d45e37b7037339b1d5b7579e74dcb72a71 Mon Sep 17 00:00:00 2001
From: Eugene
Date: Fri, 25 Aug 2023 13:46:49 -0700
Subject: [PATCH] comment

---
 openvalidators/reward/dpo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py
index 7bfdf57..6cc7d59 100644
--- a/openvalidators/reward/dpo.py
+++ b/openvalidators/reward/dpo.py
@@ -33,7 +33,7 @@ def name(self) -> str:
         return RewardModelType.dpo.value
 
     def __init__(self, device: str):
         super().__init__()
         self.device = device
-        self.penalty = 1.2
+        self.penalty = 1.2  # Same penalty as the original [paper](https://arxiv.org/pdf/1909.05858.pdf).
         self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name)
         self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name,
                                                           trust_remote_code=True,
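
The patch only annotates where `self.penalty = 1.2` comes from; it does not show how the value is consumed. As background, the linked paper (Keskar et al., 2019, "CTRL") introduces a repetition penalty that discounts the logits of tokens already present in the generated sequence. Below is a minimal sketch of that scheme, following the convention also used by `transformers`' `RepetitionPenaltyLogitsProcessor`; the function name, tensors, and usage here are illustrative assumptions, not code from `dpo.py`.

```python
import torch


def apply_repetition_penalty(logits: torch.Tensor,
                             generated_ids: torch.Tensor,
                             penalty: float = 1.2) -> torch.Tensor:
    """Discount logits of tokens that already appear in `generated_ids`.

    Positive logits are divided by `penalty` and negative logits are multiplied
    by it, so a repeated token always becomes less likely. `penalty=1.2` is the
    value the CTRL paper reports as a good trade-off between penalizing
    repetition and preserving fluent text.
    """
    logits = logits.clone()
    seen = torch.unique(generated_ids)          # token ids already generated
    scores = logits[seen]
    logits[seen] = torch.where(scores > 0, scores / penalty, scores * penalty)
    return logits


# Hypothetical usage: token ids 1 and 7 were already generated, so their
# logits are pushed down before the next-token distribution is computed.
logits = torch.tensor([1.5, -0.3, 0.0, 2.0, -1.0, 0.4, 0.9, 3.0])
penalized = apply_repetition_penalty(logits, torch.tensor([7, 1]), penalty=1.2)
```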