From baf42de41fe76d07a764027adf1c28808b1d484c Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 19:46:35 +0200 Subject: [PATCH 01/26] Add DirectPreferenceRewardModel --- openvalidators/config.py | 6 +++ openvalidators/event.py | 2 + openvalidators/neuron.py | 5 ++ openvalidators/reward/__init__.py | 1 + openvalidators/reward/config.py | 4 +- openvalidators/reward/dpo.py | 80 +++++++++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 openvalidators/reward/dpo.py diff --git a/openvalidators/config.py b/openvalidators/config.py index 6565cef..1ec0c35 100644 --- a/openvalidators/config.py +++ b/openvalidators/config.py @@ -249,6 +249,12 @@ def add_args(cls, parser): help="Weight for the reciprocate reward model", default=DefaultRewardFrameworkConfig.reciprocate_model_weight, ) + parser.add_argument( + "--reward.dpo_weight", + type=float, + help="Weight for the dpo reward model", + default=DefaultRewardFrameworkConfig.dpo_model_weight, + ) parser.add_argument( "--reward.rlhf_weight", type=float, diff --git a/openvalidators/event.py b/openvalidators/event.py index d790318..bbf797d 100644 --- a/openvalidators/event.py +++ b/openvalidators/event.py @@ -41,6 +41,7 @@ class EventSchema: nsfw_filter: Optional[List[float]] # Output vector of the nsfw filter reciprocate_reward_model: Optional[List[float]] # Output vector of the reciprocate reward model diversity_reward_model: Optional[List[float]] # Output vector of the diversity reward model + dpo_reward_model: Optional[List[float]] # Output vector of the dpo reward model rlhf_reward_model: Optional[List[float]] # Output vector of the rlhf reward model prompt_reward_model: Optional[List[float]] # Output vector of the prompt reward model relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model @@ -58,6 +59,7 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema': 'relevance_filter': event_dict.get(RewardModelType.relevance.value), 'reciprocate_reward_model': event_dict.get(RewardModelType.reciprocate.value), 'diversity_reward_model': event_dict.get(RewardModelType.diversity.value), + 'dpo_reward_model': event_dict.get(RewardModelType.dpo.value), 'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value), 'prompt_reward_model': event_dict.get(RewardModelType.prompt.value), } diff --git a/openvalidators/neuron.py b/openvalidators/neuron.py index 8944767..9995a78 100644 --- a/openvalidators/neuron.py +++ b/openvalidators/neuron.py @@ -35,6 +35,7 @@ from openvalidators.reward import ( Blacklist, NSFWRewardModel, + DirectPreferenceRewardModel, OpenAssistantRewardModel, ReciprocateRewardModel, BertRelevanceRewardModel, @@ -142,6 +143,7 @@ def __init__(self): else: self.reward_weights = torch.tensor( [ + self.config.reward.dpo_weight, self.config.reward.rlhf_weight, self.config.reward.reciprocate_weight, self.config.reward.dahoas_weight, @@ -160,6 +162,9 @@ def __init__(self): raise Exception(message) self.reward_functions = [ + DirectPreferenceRewardModel(device=self.device) + if self.config.reward.dpo_weight > 0 + else MockRewardModel(RewardModelType.dpo.value), OpenAssistantRewardModel(device=self.device) if self.config.reward.rlhf_weight > 0 else MockRewardModel(RewardModelType.rlhf.value), diff --git a/openvalidators/reward/__init__.py b/openvalidators/reward/__init__.py index 6a94469..28a8a5a 100644 --- a/openvalidators/reward/__init__.py +++ b/openvalidators/reward/__init__.py @@ -1,5 +1,6 @@ from .blacklist import Blacklist from 
.nsfw import NSFWRewardModel +from .dpo import DirectPreferenceRewardModel from .open_assistant import OpenAssistantRewardModel from .reciprocate import ReciprocateRewardModel from .relevance import BertRelevanceRewardModel diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index 2f4b63b..2037a73 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -20,6 +20,7 @@ class RewardModelType(Enum): + dpo = 'dpo_reward_model' rlhf = 'rlhf_reward_model' reciprocate = 'reciprocate_reward_model' dahoas = 'dahoas_reward_model' @@ -35,7 +36,8 @@ class DefaultRewardFrameworkConfig: """Reward framework default configuration. Note: All the weights should add up to 1.0. """ - rlhf_model_weight: float = 0.6 + dpo_model_weight: float = 0.2 + rlhf_model_weight: float = 0.4 reciprocate_model_weight: float = 0.4 dahoas_model_weight: float = 0 prompt_model_weight: float = 0 diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py new file mode 100644 index 0000000..780f18f --- /dev/null +++ b/openvalidators/reward/dpo.py @@ -0,0 +1,80 @@ +# The MIT License (MIT) +# Copyright © 2021 Yuma Rao + +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +import torch +from typing import List +from .config import RewardModelType +from .reward import BaseRewardModel +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class DirectPreferenceRewardModel(BaseRewardModel): + + reward_model_name: str = "TheBloke/Llama-2-7B-fp16" + + @property + def name(self) -> str: return RewardModelType.dpo.value + + def __init__(self, device: str): + super().__init__() + self.device = device + self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name) + self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name, + torch_dtype=torch.float16).to(self.device) + + def reward_single(self, prompt: str, completion: str, name: str) -> float: + r""" Calculates a direct preference optimization (DPO) style reward for a completion, + which is a reference model's average log-probability for completion tokens given a prompt. + Uses guidance from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py. + """ + with torch.no_grad(): + # Tokenize the combined prompt + completion. + combined = self.tokenizer(prompt + completion, return_tensors="pt").input_ids[0].to(self.device) # [seq_len] + # Tokenize only the prompt, to help determine prompt token length. 
+ prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device) # [prompt_len] + # Ensure that the prompt_part tokens align with the combined tokens. + assert (prompt_part == combined[:len(prompt_part)]).all() + + labels = combined.clone() # [seq_len] + # Label only each next token prediction ground-truth. + labels = labels[1:] # [seq_len-1] + # Ignore prompt part for calculating reward. + labels[1:len(prompt_part)] = -100 + loss_mask = (labels != -100) # [seq_len-1] + + # Dummy token to allow for indexing, but loss will be ignored. + labels[labels == -100] = 0 + # Reshape for gather operation. + labels = labels.unsqueeze(0).unsqueeze(2) # [batch_size=1, seq_len-1, :] + + # Forward pass to calculate logit predictions for each sequence position. + logits = self.model(combined.unsqueeze(0)).logits # [batch_size=1, seq_len, vocab_len] + # Predict only where labels are available + logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len] + # Rescale via log(softmax(logits)). + logits = logits.log_softmax(-1) + # Calculate the model's log-probability for each actual completion token. + per_token_logps = torch.gather(logits, dim=2, index=labels).squeeze(2) # [batch_size=1, seq_len-1] + # Average log-probability over completion sequence. + reward = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) # [batch_size=1] + reward = reward[0].cpu().detach() + + return reward + + def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: + return torch.tensor([self.reward_single(prompt, completion, name) for completion in completions], + dtype=torch.float32).to(self.device) From 8fe9acc9a65dfa72cdc9a527b49d9a9972414eab Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 19:47:16 +0200 Subject: [PATCH 02/26] Maint comment update --- openvalidators/reward/dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 780f18f..9fc0b02 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -63,7 +63,7 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: # Forward pass to calculate logit predictions for each sequence position. logits = self.model(combined.unsqueeze(0)).logits # [batch_size=1, seq_len, vocab_len] - # Predict only where labels are available + # Predict only where labels are available. logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len] # Rescale via log(softmax(logits)). 
logits = logits.log_softmax(-1) From 01f0c3f6dfff3165dfedb8e929a3329c4178808f Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 20:36:46 +0200 Subject: [PATCH 03/26] Support Llama-2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 18de33a..d8bf1a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ bittensor==5.2.0 -transformers<=4.28.0 +transformers<=4.31.0 wandb==0.15.3 datasets==2.12.0 plotly==5.14.1 From 4b2d7fe1a4e02b6ce320790391b93308b82b3d50 Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 21:00:16 +0200 Subject: [PATCH 04/26] Update token handling --- openvalidators/reward/dpo.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 9fc0b02..45f58db 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -46,14 +46,12 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: combined = self.tokenizer(prompt + completion, return_tensors="pt").input_ids[0].to(self.device) # [seq_len] # Tokenize only the prompt, to help determine prompt token length. prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device) # [prompt_len] - # Ensure that the prompt_part tokens align with the combined tokens. - assert (prompt_part == combined[:len(prompt_part)]).all() labels = combined.clone() # [seq_len] + # Ignore prompt part for calculating reward. + labels[:len(prompt_part)] = -100 # Label only each next token prediction ground-truth. labels = labels[1:] # [seq_len-1] - # Ignore prompt part for calculating reward. - labels[1:len(prompt_part)] = -100 loss_mask = (labels != -100) # [seq_len-1] # Dummy token to allow for indexing, but loss will be ignored. From 59b7ec0b92a4b5eb0ee472001e666a90593b0c5c Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 21:01:39 +0200 Subject: [PATCH 05/26] Update requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d8bf1a2..7a0758d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ bittensor==5.2.0 -transformers<=4.31.0 +transformers>=4.31.0 wandb==0.15.3 datasets==2.12.0 plotly==5.14.1 From d1e1f9f55bc89998074c335badeee5449b1d9aef Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 20 Jul 2023 21:03:10 +0200 Subject: [PATCH 06/26] Update requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7a0758d..d8bf1a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ bittensor==5.2.0 -transformers>=4.31.0 +transformers<=4.31.0 wandb==0.15.3 datasets==2.12.0 plotly==5.14.1 From e8f7559de0235cd1d9ea1a2594070c5a89417a76 Mon Sep 17 00:00:00 2001 From: opentaco Date: Fri, 21 Jul 2023 17:00:55 +0200 Subject: [PATCH 07/26] Add trace --- openvalidators/reward/dpo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 45f58db..83f0b26 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -16,6 +16,7 @@ # DEALINGS IN THE SOFTWARE. 
import torch +import bittensor as bt from typing import List from .config import RewardModelType from .reward import BaseRewardModel @@ -53,7 +54,8 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: # Label only each next token prediction ground-truth. labels = labels[1:] # [seq_len-1] loss_mask = (labels != -100) # [seq_len-1] - + bt.logging.trace(f"DirectPreferenceRewardModel | len(combined)={len(combined)}, " + f"len(prompt)={len(prompt_part)}") # Dummy token to allow for indexing, but loss will be ignored. labels[labels == -100] = 0 # Reshape for gather operation. @@ -63,6 +65,8 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: logits = self.model(combined.unsqueeze(0)).logits # [batch_size=1, seq_len, vocab_len] # Predict only where labels are available. logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len] + bt.logging.trace(f"DirectPreferenceRewardModel | logits: {logits}") + # Rescale via log(softmax(logits)). logits = logits.log_softmax(-1) # Calculate the model's log-probability for each actual completion token. @@ -71,6 +75,8 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: reward = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) # [batch_size=1] reward = reward[0].cpu().detach() + bt.logging.trace(f"DirectPreferenceRewardModel | reward: {reward}") + return reward def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: From 7e8a0d08ed2aa757dab67865dd74921c0c47536a Mon Sep 17 00:00:00 2001 From: opentaco Date: Fri, 21 Jul 2023 17:54:20 +0200 Subject: [PATCH 08/26] Handle nan/inf --- openvalidators/reward/dpo.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 83f0b26..ea1f493 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -54,8 +54,6 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: # Label only each next token prediction ground-truth. labels = labels[1:] # [seq_len-1] loss_mask = (labels != -100) # [seq_len-1] - bt.logging.trace(f"DirectPreferenceRewardModel | len(combined)={len(combined)}, " - f"len(prompt)={len(prompt_part)}") # Dummy token to allow for indexing, but loss will be ignored. labels[labels == -100] = 0 # Reshape for gather operation. @@ -65,7 +63,6 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: logits = self.model(combined.unsqueeze(0)).logits # [batch_size=1, seq_len, vocab_len] # Predict only where labels are available. logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len] - bt.logging.trace(f"DirectPreferenceRewardModel | logits: {logits}") # Rescale via log(softmax(logits)). logits = logits.log_softmax(-1) @@ -75,10 +72,13 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: reward = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) # [batch_size=1] reward = reward[0].cpu().detach() - bt.logging.trace(f"DirectPreferenceRewardModel | reward: {reward}") - - return reward + # NaNs can possibly arise through log(0)=-inf, replace with suitably small logits. + if torch.isnan(reward) or torch.isinf(reward): + return -11. 
# exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size) + return reward.item() def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: - return torch.tensor([self.reward_single(prompt, completion, name) for completion in completions], - dtype=torch.float32).to(self.device) + rewards = torch.tensor([self.reward_single(prompt, completion, name) for completion in completions], + dtype=torch.float32).to(self.device) + bt.logging.trace(f"DirectPreferenceRewardModel | rewards: {rewards.tolist()}") + return rewards From 2996a22db788d0ac6b7155f626d23c01a63602d4 Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 27 Jul 2023 20:00:38 +0200 Subject: [PATCH 09/26] Use BTLM and check max seq length --- openvalidators/reward/dpo.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index ea1f493..3a5860b 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -25,7 +25,7 @@ class DirectPreferenceRewardModel(BaseRewardModel): - reward_model_name: str = "TheBloke/Llama-2-7B-fp16" + reward_model_name: str = "cerebras/btlm-3b-8k-base" @property def name(self) -> str: return RewardModelType.dpo.value @@ -35,6 +35,7 @@ def __init__(self, device: str): self.device = device self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name) self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name, + trust_remote_code=True, torch_dtype=torch.float16).to(self.device) def reward_single(self, prompt: str, completion: str, name: str) -> float: @@ -48,6 +49,14 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: # Tokenize only the prompt, to help determine prompt token length. prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device) # [prompt_len] + # Completion doesn't fit into model sequence, so return lowest reward. + if self.tokenizer.model_max_length <= len(prompt_part): + return -11. # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size) + + # Truncate combined to fit into model max sequence length. + if self.tokenizer.model_max_length < len(combined): + combined = combined[:self.tokenizer.model_max_length] + labels = combined.clone() # [seq_len] # Ignore prompt part for calculating reward. labels[:len(prompt_part)] = -100 From 247785cafca2804ccd5e34350b93c3d2aed9c723 Mon Sep 17 00:00:00 2001 From: Steffen Cruz Date: Mon, 14 Aug 2023 16:22:17 -0600 Subject: [PATCH 10/26] Update README.md --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index 581c1a8..ac17f8f 100644 --- a/README.md +++ b/README.md @@ -118,13 +118,6 @@ Check the [README of the data collector](./scripts/README.md) for more informati ---- ## Experimental Features -### Prompt-Based Scoring -The reward mechanism for miner completions plays a crucial role in the overall quality of the network. As such, we are constantly developing and testing new methods that make the reward process **open** and **robust**. This benefits everyone. Presently, miners weights are set based on evaluations of their completions that are carried out by a reward model. This presents two major challenges: - -1. Reward model evaluations are a bottleneck, owing to the large model size -2. 
Reward models are vulnerable to attacks, which reduces the network quality for everyone - -Consequently, validators also perform *shadow scoring*, which outsources the reward mechanism to the network. This feature is currently under development, and so the prompt-based scores are only used for research purposes. ## Sentence Embedding Gating Model Another cornerstone of the validator functionality is the use of a mixture of experts (MoE) model, which we call the gating model, to enable queries to be efficiently routed to the best-suited miners. **This incentivizes miners to become specialists, which in turn improves response quality**. It also reduces latency and addresses bandwidth issues in the network. From 44fb6c0e6369e107b0cbfc5182c342034b6467da Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Mon, 21 Aug 2023 12:14:25 -0400 Subject: [PATCH 11/26] Need to use print exception instead --- openvalidators/run.py | 4 ++-- scripts/data_collector.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/openvalidators/run.py b/openvalidators/run.py index cd69f6c..47a85fc 100644 --- a/openvalidators/run.py +++ b/openvalidators/run.py @@ -17,7 +17,7 @@ import asyncio import bittensor as bt -from traceback import print_exc +from traceback import print_exception from openvalidators.forward import forward from openvalidators.utils import should_checkpoint, checkpoint, should_reinit_wandb, reinit_wandb, load_state, save_state @@ -58,4 +58,4 @@ async def run_forward(): except Exception as e: bt.logging.error("Error in training loop", str(e)) - bt.logging.debug(print_exc(e)) + bt.logging.debug(print_exception(value=e)) diff --git a/scripts/data_collector.py b/scripts/data_collector.py index 24c1563..a67c36d 100644 --- a/scripts/data_collector.py +++ b/scripts/data_collector.py @@ -22,7 +22,7 @@ import openvalidators import os from analysis.utils import get_runs, download_data -from traceback import print_exc +from traceback import print_exception from typing import List from data_formatter import create_json_dataset, create_csv_dataset, create_openai_dataset @@ -170,4 +170,4 @@ def create_mining_dataset( ) except Exception as e: bt.logging.error("Error in training loop", str(e)) - bt.logging.debug(print_exc(e)) + bt.logging.debug(print_exception(value=e)) From e5bab039f77b048307ed6c475c0b5d86cb5bd197 Mon Sep 17 00:00:00 2001 From: p-ferreira <38992619+p-ferreira@users.noreply.github.com> Date: Wed, 23 Aug 2023 09:48:44 -0400 Subject: [PATCH 12/26] update wandb config --- openvalidators/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index e9e6c92..23827b2 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -49,12 +49,15 @@ def init_wandb(self, reinit=False): if self.config.neuron.disable_log_rewards: tags.append("disable_log_rewards") + wandb_config = {key: self.config.get(key, None) for key in ('neuron', 'reward', 'netuid', 'wandb')} + wandb_config['neuron'].pop('full_path', None) + self.wandb = wandb.init( anonymous="allow", reinit=reinit, project=self.config.wandb.project_name, entity=self.config.wandb.entity, - config={key: self.config.get(key, None) for key in ('neuron', 'reward')}, + config=wandb_config, mode="offline" if self.config.wandb.offline else "online", dir=self.config.neuron.full_path, tags=tags, From baf3f6333f607ca239f74bf095df24df62a1dbff Mon Sep 17 00:00:00 2001 From: p-ferreira <38992619+p-ferreira@users.noreply.github.com> Date: Wed, 23 Aug 2023 10:03:28 -0400 
Subject: [PATCH 13/26] adds deep copy --- openvalidators/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 23827b2..c9d12f8 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -49,7 +49,7 @@ def init_wandb(self, reinit=False): if self.config.neuron.disable_log_rewards: tags.append("disable_log_rewards") - wandb_config = {key: self.config.get(key, None) for key in ('neuron', 'reward', 'netuid', 'wandb')} + wandb_config = {key: copy.deepcopy(self.config.get(key, None)) for key in ('neuron', 'reward', 'netuid', 'wandb')} wandb_config['neuron'].pop('full_path', None) self.wandb = wandb.init( From e1519d17ac61da8e7a183d400d81543f9cadf788 Mon Sep 17 00:00:00 2001 From: opentaco Date: Thu, 24 Aug 2023 11:01:57 +0200 Subject: [PATCH 14/26] Add DPO to test --- tests/test_event.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_event.py b/tests/test_event.py index 7fb9f2b..ff366f5 100644 --- a/tests/test_event.py +++ b/tests/test_event.py @@ -42,6 +42,7 @@ def test_event_from_dict_all_forward_columns_match(self): RewardModelType.nsfw.value: [1.0], RewardModelType.reciprocate.value: [1.0], RewardModelType.diversity.value: [1.0], + RewardModelType.dpo.value: [1.0], RewardModelType.rlhf.value: [1.0], RewardModelType.prompt.value: [1.0], RewardModelType.relevance.value: [1.0], @@ -100,6 +101,7 @@ def test_event_from_dict_forward_no_reward_logging(self): assert event.nsfw_filter is None assert event.reciprocate_reward_model is None assert event.diversity_reward_model is None + assert event.dpo_reward_model is None assert event.rlhf_reward_model is None assert event.prompt_reward_model is None assert event.relevance_filter is None @@ -141,6 +143,7 @@ def test_event_from_dict_forward_reward_logging_mismatch(self): assert event.nsfw_filter is None assert event.reciprocate_reward_model is None assert event.diversity_reward_model is None + assert event.dpo_reward_model is None assert event.rlhf_reward_model is None assert event.prompt_reward_model is None assert event.relevance_filter is None From 23d62eaddeb621e36267128eef6448dcd5904e0a Mon Sep 17 00:00:00 2001 From: isabella618033 Date: Thu, 24 Aug 2023 14:49:19 +0000 Subject: [PATCH 15/26] added original reward to wandb --- openvalidators/forward.py | 14 ++++++++------ openvalidators/reward/reward.py | 12 +++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/openvalidators/forward.py b/openvalidators/forward.py index 59589b2..5b5e071 100644 --- a/openvalidators/forward.py +++ b/openvalidators/forward.py @@ -87,18 +87,20 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude # Compute the rewards for the responses given the prompt. 
rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device) for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions): - reward_i = reward_fn_i.apply(prompt, responses, name).to(self.device) - rewards += weight_i * reward_i + reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name).to(self.device) + rewards += weight_i * reward_i_normalized if not self.config.neuron.disable_log_rewards: event[reward_fn_i.name] = reward_i.tolist() - bt.logging.trace(str(reward_fn_i.name), reward_i.tolist()) + event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist() + bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist()) for masking_fn_i in self.masking_functions: - mask_i = masking_fn_i.apply(base_prompt, responses, name).to(self.device) - rewards *= mask_i # includes diversity + mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name).to(self.device) + rewards *= mask_i_normalized # includes diversity if not self.config.neuron.disable_log_rewards: event[masking_fn_i.name] = mask_i.tolist() - bt.logging.trace(str(masking_fn_i.name), mask_i.tolist()) + event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist() + bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist()) # Train the gating model based on the predicted scores and the actual rewards. gating_scores: torch.FloatTensor = self.gating_model(prompt).to(self.device) diff --git a/openvalidators/reward/reward.py b/openvalidators/reward/reward.py index 292d5d6..92d8958 100644 --- a/openvalidators/reward/reward.py +++ b/openvalidators/reward/reward.py @@ -98,17 +98,19 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) -> successful_rewards = self.get_rewards( prompt, successful_completions, name ) # Softmax rewards across samples. - successful_rewards = self.normalize_rewards( successful_rewards ) + successful_rewards_normalized = self.normalize_rewards( successful_rewards ) # Init zero rewards for all calls. filled_rewards = torch.zeros( len( responses ), dtype=torch.float32) + filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32) # Fill reward tensor. - for idx, reward in zip(successful_completions_indices, successful_rewards): + for idx, reward, normalized_reward in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized): filled_rewards[idx] = reward + filled_rewards_normalized[idx] = normalized_reward # Return the filled rewards. 
- return filled_rewards + return filled_rewards, filled_rewards_normalized class MockRewardModel( BaseRewardModel ): @@ -121,7 +123,7 @@ def __init__(self, mock_name: str = 'MockReward'): self.mock_name = mock_name def apply( self, prompt: str, completion: List[str], name: str ) -> torch.FloatTensor: - return torch.tensor( [0 for _ in completion], dtype=torch.float32 ) - + mock_reward = torch.tensor( [0 for _ in completion], dtype=torch.float32 ) + return mock_reward, mock_reward \ No newline at end of file From 21dbce94ebb10bdcec6d48364266b3f3031080d7 Mon Sep 17 00:00:00 2001 From: isabella618033 Date: Thu, 24 Aug 2023 14:50:56 +0000 Subject: [PATCH 16/26] naming convention --- openvalidators/reward/reward.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/reward/reward.py b/openvalidators/reward/reward.py index 92d8958..e95f931 100644 --- a/openvalidators/reward/reward.py +++ b/openvalidators/reward/reward.py @@ -105,9 +105,9 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) -> filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32) # Fill reward tensor. - for idx, reward, normalized_reward in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized): + for idx, reward, reward_normalized in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized): filled_rewards[idx] = reward - filled_rewards_normalized[idx] = normalized_reward + filled_rewards_normalized[idx] = reward_normalized # Return the filled rewards. return filled_rewards, filled_rewards_normalized From ea3f44d4dadf5e58afa0990dd5977286fa2f4713 Mon Sep 17 00:00:00 2001 From: isabella618033 Date: Fri, 25 Aug 2023 17:13:18 +0000 Subject: [PATCH 17/26] big fix --- openvalidators/forward.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openvalidators/forward.py b/openvalidators/forward.py index 5b5e071..8b71847 100644 --- a/openvalidators/forward.py +++ b/openvalidators/forward.py @@ -87,16 +87,16 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude # Compute the rewards for the responses given the prompt. 
rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device) for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions): - reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name).to(self.device) - rewards += weight_i * reward_i_normalized + reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name) + rewards += weight_i * reward_i_normalized.to(self.device) if not self.config.neuron.disable_log_rewards: event[reward_fn_i.name] = reward_i.tolist() event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist() bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist()) for masking_fn_i in self.masking_functions: - mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name).to(self.device) - rewards *= mask_i_normalized # includes diversity + mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name) + rewards *= mask_i_normalized.to(self.device) # includes diversity if not self.config.neuron.disable_log_rewards: event[masking_fn_i.name] = mask_i.tolist() event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist() bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist()) From 2c10303acf4daec8dc6132a56d493778475519bd Mon Sep 17 00:00:00 2001 From: Eugene Date: Fri, 25 Aug 2023 11:41:02 -0700 Subject: [PATCH 18/26] penalty update --- openvalidators/reward/dpo.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 3a5860b..a48f350 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -33,17 +33,23 @@ def name(self) -> str: return RewardModelType.dpo.value def __init__(self, device: str): super().__init__() self.device = device + self.penalty = 1.5 self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name) self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name, trust_remote_code=True, torch_dtype=torch.float16).to(self.device) - def reward_single(self, prompt: str, completion: str, name: str) -> float: + def reward_single(self, prompt: str, completion: str, name: str, with_penalty=True) -> float: r""" Calculates a direct preference optimization (DPO) style reward for a completion, which is a reference model's average log-probability for completion tokens given a prompt. Uses guidance from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py. """ with torch.no_grad(): + + # Check if completion is empty or too short; if so, return the lowest reward. + if completion.strip() == '' or len(completion) <= 5: + return -11 # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size) + # Tokenize the combined prompt + completion. combined = self.tokenizer(prompt + completion, return_tensors="pt").input_ids[0].to(self.device) # [seq_len] # Tokenize only the prompt, to help determine prompt token length. prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device) # [prompt_len] @@ -73,6 +79,13 @@ def reward_single(self, prompt: str, completion: str, name: str) -> float: # Predict only where labels are available. logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len] + if with_penalty: + # Apply penalty for repeated generation + for i in range(len(prompt_part)+1, len(combined)-1): + logit = logits[:,i,:].clone() + inputs = combined[len(prompt_part):i].clone() + logits[:,i,:] = self.logit_penalty(input_ids=inputs, logit=logit) + # Rescale via log(softmax(logits)). logits = logits.log_softmax(-1) # Calculate the model's log-probability for each actual completion token. 
@@ -91,3 +104,14 @@ def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.F dtype=torch.float32).to(self.device) bt.logging.trace(f"DirectPreferenceRewardModel | rewards: {rewards.tolist()}") return rewards + + def logit_penalty(self, input_ids: torch.LongTensor, logit: torch.FloatTensor) -> torch.FloatTensor: + # Counts the unique tokens within each generation + uniques, counts = input_ids.unique(return_counts=True) + score = torch.gather(logit, 1, uniques.unsqueeze(0)) + + # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability + score = torch.where(score < 0, score * (self.penalty**counts), score / (self.penalty**counts)) + + logit.scatter_(1, uniques.unsqueeze(0), score.to(logit.dtype)) + return logit \ No newline at end of file From e8a65f2c1e169776b357ff7144c386ee2ef7148b Mon Sep 17 00:00:00 2001 From: Eugene Date: Fri, 25 Aug 2023 11:41:33 -0700 Subject: [PATCH 19/26] dpo 1.2 --- openvalidators/reward/dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index a48f350..7bfdf57 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -33,7 +33,7 @@ def name(self) -> str: return RewardModelType.dpo.value def __init__(self, device: str): super().__init__() self.device = device - self.penalty = 1.5 + self.penalty = 1.2 self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name) self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name, trust_remote_code=True, From 49da13453b91b26a2b625fc57a3b1c0ea4e20e3f Mon Sep 17 00:00:00 2001 From: isabella618033 Date: Fri, 25 Aug 2023 19:01:52 +0000 Subject: [PATCH 20/26] logging fix --- openvalidators/reward/reward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/reward/reward.py b/openvalidators/reward/reward.py index e95f931..11ce39c 100644 --- a/openvalidators/reward/reward.py +++ b/openvalidators/reward/reward.py @@ -101,7 +101,7 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) -> successful_rewards_normalized = self.normalize_rewards( successful_rewards ) # Init zero rewards for all calls. - filled_rewards = torch.zeros( len( responses ), dtype=torch.float32) + filled_rewards = torch.ones( len( responses ), dtype=torch.float32) * torch.nan filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32) # Fill reward tensor. 
From 39e5b75b64342ef4a67fbc64738cde007408f1ca Mon Sep 17 00:00:00 2001 From: isabella618033 Date: Fri, 25 Aug 2023 20:03:06 +0000 Subject: [PATCH 21/26] added normalized values to event --- openvalidators/event.py | 22 +++++++++++++++++++++- tests/test_event.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/openvalidators/event.py b/openvalidators/event.py index 48ebabd..a7af8a0 100644 --- a/openvalidators/event.py +++ b/openvalidators/event.py @@ -47,6 +47,16 @@ class EventSchema: relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model task_validator_filter: Optional[List[float]] + dahoas_reward_model_normalized: Optional[List[float]] # Output vector of the dahoas reward model + nsfw_filter_normalized: Optional[List[float]] # Output vector of the nsfw filter + reciprocate_reward_model_normalized: Optional[List[float]] # Output vector of the reciprocate reward model + diversity_reward_model_normalized: Optional[List[float]] # Output vector of the diversity reward model + dpo_reward_model_normalized: Optional[List[float]] # Output vector of the dpo reward model + rlhf_reward_model_normalized: Optional[List[float]] # Output vector of the rlhf reward model + prompt_reward_model_normalized: Optional[List[float]] # Output vector of the prompt reward model + relevance_filter_normalized: Optional[List[float]] # Output vector of the relevance scoring reward model + task_validator_filter_normalized: Optional[List[float]] + # Weights data set_weights: Optional[List[List[float]]] @@ -54,8 +64,8 @@ class EventSchema: def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema': """Converts a dictionary to an EventSchema object.""" rewards = { - 'dahoas_reward_model': event_dict.get(RewardModelType.dahoas.value), 'blacklist_filter': event_dict.get(RewardModelType.blacklist.value), + 'dahoas_reward_model': event_dict.get(RewardModelType.dahoas.value), 'task_validator_filter': event_dict.get(RewardModelType.task_validator.value), 'nsfw_filter': event_dict.get(RewardModelType.nsfw.value), 'relevance_filter': event_dict.get(RewardModelType.relevance.value), @@ -64,6 +74,16 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema': 'dpo_reward_model': event_dict.get(RewardModelType.dpo.value), 'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value), 'prompt_reward_model': event_dict.get(RewardModelType.prompt.value), + + 'dahoas_reward_model_normalized': event_dict.get(RewardModelType.dahoas.value + '_normalized'), + 'task_validator_filter_normalized': event_dict.get(RewardModelType.task_validator.value + '_normalized'), + 'nsfw_filter_normalized': event_dict.get(RewardModelType.nsfw.value + '_normalized'), + 'relevance_filter_normalized': event_dict.get(RewardModelType.relevance.value + '_normalized'), + 'reciprocate_reward_model_normalized': event_dict.get(RewardModelType.reciprocate.value + '_normalized'), + 'diversity_reward_model_normalized': event_dict.get(RewardModelType.diversity.value + '_normalized'), + 'dpo_reward_model_normalized': event_dict.get(RewardModelType.dpo.value + '_normalized'), + 'rlhf_reward_model_normalized': event_dict.get(RewardModelType.rlhf.value + '_normalized'), + 'prompt_reward_model_normalized': event_dict.get(RewardModelType.prompt.value + '_normalized'), } # Logs warning that expected data was not set properly diff --git a/tests/test_event.py b/tests/test_event.py index ff366f5..44ce9c2 100644 --- a/tests/test_event.py +++ 
b/tests/test_event.py @@ -46,7 +46,18 @@ def test_event_from_dict_all_forward_columns_match(self): RewardModelType.rlhf.value: [1.0], RewardModelType.prompt.value: [1.0], RewardModelType.relevance.value: [1.0], - RewardModelType.task_validator.value: [1.0] + RewardModelType.task_validator.value: [1.0], + + RewardModelType.dahoas.value + '_normalized': [1.0], + RewardModelType.blacklist.value + '_normalized': [1.0], + RewardModelType.nsfw.value + '_normalized': [1.0], + RewardModelType.reciprocate.value + '_normalized': [1.0], + RewardModelType.diversity.value + '_normalized': [1.0], + RewardModelType.dpo.value + '_normalized': [1.0], + RewardModelType.rlhf.value + '_normalized': [1.0], + RewardModelType.prompt.value + '_normalized': [1.0], + RewardModelType.relevance.value + '_normalized': [1.0], + RewardModelType.task_validator.value + '_normalized': [1.0] } # Act @@ -107,6 +118,16 @@ def test_event_from_dict_forward_no_reward_logging(self): assert event.relevance_filter is None assert event.task_validator_filter is None + assert event.dahoas_reward_model_normalized is None + assert event.nsfw_filter_normalized is None + assert event.reciprocate_reward_model_normalized is None + assert event.diversity_reward_model_normalized is None + assert event.dpo_reward_model_normalized is None + assert event.rlhf_reward_model_normalized is None + assert event.prompt_reward_model_normalized is None + assert event.relevance_filter_normalized is None + assert event.task_validator_filter_normalized is None + def test_event_from_dict_forward_reward_logging_mismatch(self): """Test that all default columns logged on the forward pass are correctly converted and that that reward columns that should be logged are logged as warnings""" @@ -124,7 +145,12 @@ def test_event_from_dict_forward_reward_logging_mismatch(self): 'rewards': [1.0], } - not_logged_columns = [field.value for field in RewardModelType] + not_logged_columns = [] + for field in RewardModelType: + not_logged_columns.append(field.value) + if field.value != 'blacklist_filter': + not_logged_columns.append(field.value + '_normalized') + # Act with patch('bittensor.logging.warning') as mock_warning: @@ -149,3 +175,12 @@ def test_event_from_dict_forward_reward_logging_mismatch(self): assert event.relevance_filter is None assert event.task_validator_filter is None + assert event.dahoas_reward_model_normalized is None + assert event.nsfw_filter_normalized is None + assert event.reciprocate_reward_model_normalized is None + assert event.diversity_reward_model_normalized is None + assert event.dpo_reward_model_normalized is None + assert event.rlhf_reward_model_normalized is None + assert event.prompt_reward_model_normalized is None + assert event.relevance_filter_normalized is None + assert event.task_validator_filter_normalized is None From b156a6d45e37b7037339b1d5b7579e74dcb72a71 Mon Sep 17 00:00:00 2001 From: Eugene Date: Fri, 25 Aug 2023 13:46:49 -0700 Subject: [PATCH 22/26] comment --- openvalidators/reward/dpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/reward/dpo.py b/openvalidators/reward/dpo.py index 7bfdf57..6cc7d59 100644 --- a/openvalidators/reward/dpo.py +++ b/openvalidators/reward/dpo.py @@ -33,7 +33,7 @@ def name(self) -> str: return RewardModelType.dpo.value def __init__(self, device: str): super().__init__() self.device = device - self.penalty = 1.2 + self.penalty = 1.2 # Same penalty as the original [paper](https://arxiv.org/pdf/1909.05858.pdf). 
self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name) self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name, trust_remote_code=True, From c9dbf4dfb99c5128a4783a1120131a1e8443616e Mon Sep 17 00:00:00 2001 From: Eugene Date: Fri, 25 Aug 2023 13:56:04 -0700 Subject: [PATCH 23/26] reweighting --- openvalidators/reward/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index cdab7d0..53581b0 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -35,8 +35,8 @@ class DefaultRewardFrameworkConfig: """Reward framework default configuration. Note: All the weights should add up to 1.0. """ - dpo_model_weight: float = 0.2 + dpo_model_weight: float = 0.4 rlhf_model_weight: float = 0.4 - reciprocate_model_weight: float = 0.4 + reciprocate_model_weight: float = 0.2 dahoas_model_weight: float = 0 prompt_model_weight: float = 0 From e513a97e77936fdac83441113b114d3b15b74926 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 28 Aug 2023 08:27:35 -0700 Subject: [PATCH 24/26] small reweight --- openvalidators/reward/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index 53581b0..a8ae5dc 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -35,8 +35,8 @@ class DefaultRewardFrameworkConfig: """Reward framework default configuration. Note: All the weights should add up to 1.0. """ - dpo_model_weight: float = 0.4 + dpo_model_weight: float = 0.3 rlhf_model_weight: float = 0.4 - reciprocate_model_weight: float = 0.2 + reciprocate_model_weight: float = 0.3 dahoas_model_weight: float = 0 prompt_model_weight: float = 0 From dfc2174b2a164cb959ee64686709ce0661844f83 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 28 Aug 2023 08:49:22 -0700 Subject: [PATCH 25/26] version update --- openvalidators/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/__init__.py b/openvalidators/__init__.py index 87cf974..e065d6d 100644 --- a/openvalidators/__init__.py +++ b/openvalidators/__init__.py @@ -28,6 +28,6 @@ from . import weights from . 
import event -__version__ = "1.1.8" +__version__ = "1.2.0" version_split = __version__.split(".") __spec_version__ = (1000 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2])) From be21a17ba4b06cc77cc9975d6bcb2908d8c55dc5 Mon Sep 17 00:00:00 2001 From: p-ferreira <38992619+p-ferreira@users.noreply.github.com> Date: Mon, 28 Aug 2023 12:02:41 -0400 Subject: [PATCH 26/26] updates changelog --- CHANGELOG.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86792ae..0ccfab9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,24 @@ # Changelog -## 1.1.8 / 2023-08-12 +## 1.2.0 / 2023-08-28 +### What's Changed +- Adds Direct Preference Optimization (DPO) style rewards by @opentaco on #99 +- Changes print format on exception catch by @camfairchild on #135 +- Brings back netuid and wandb to logged config by @p-ferreira on #137 +- Adds DPO penalty update by @Eugene-hu on #138 +- Adds original reward output to wandb logs by @isabella618033 on #139 +- Reweights reward models by @Eugene-hu on #140 +- Update stale documentation by @steffencruz on #129 -## What's Changed -- Make sure to serve axon first by @camfairchild in 14921d35c -**Full Changelog**: https://github.com/opentensor/validators/compare/v1.1.7...v1.1.8 +**Full Changelog**: https://github.com/opentensor/validators/compare/v1.1.7...v1.2.0 ## 1.1.8 / 2023-08-12 +### What's Changed +- Make sure to serve axon first by @camfairchild in 14921d35c +- Adds scripts for releases on github by @camfairchild in #128 +- Wandb config log changes by @isabella618033 in #132 ## 1.1.7 / 2023-08-11 ### What’s Changed