diff --git a/openvalidators/forward.py b/openvalidators/forward.py
index 59589b2..5b5e071 100644
--- a/openvalidators/forward.py
+++ b/openvalidators/forward.py
@@ -87,18 +87,20 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude
     # Compute the rewards for the responses given the prompt.
     rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device)
     for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions):
-        reward_i = reward_fn_i.apply(prompt, responses, name).to(self.device)
-        rewards += weight_i * reward_i
+        reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name)
+        rewards += weight_i * reward_i_normalized.to(self.device)
         if not self.config.neuron.disable_log_rewards:
             event[reward_fn_i.name] = reward_i.tolist()
-        bt.logging.trace(str(reward_fn_i.name), reward_i.tolist())
+            event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist()
+        bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist())

     for masking_fn_i in self.masking_functions:
-        mask_i = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
-        rewards *= mask_i  # includes diversity
+        mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name)
+        rewards *= mask_i_normalized.to(self.device)  # includes diversity
         if not self.config.neuron.disable_log_rewards:
             event[masking_fn_i.name] = mask_i.tolist()
-        bt.logging.trace(str(masking_fn_i.name), mask_i.tolist())
+            event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist()
+        bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist())

     # Train the gating model based on the predicted scores and the actual rewards.
     gating_scores: torch.FloatTensor = self.gating_model(prompt).to(self.device)
diff --git a/openvalidators/reward/reward.py b/openvalidators/reward/reward.py
index 292d5d6..92d8958 100644
--- a/openvalidators/reward/reward.py
+++ b/openvalidators/reward/reward.py
@@ -98,17 +98,19 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) ->
         successful_rewards = self.get_rewards( prompt, successful_completions, name )

         # Softmax rewards across samples.
-        successful_rewards = self.normalize_rewards( successful_rewards )
+        successful_rewards_normalized = self.normalize_rewards( successful_rewards )

         # Init zero rewards for all calls.
         filled_rewards = torch.zeros( len( responses ), dtype=torch.float32)
+        filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32)

         # Fill reward tensor.
-        for idx, reward in zip(successful_completions_indices, successful_rewards):
+        for idx, reward, normalized_reward in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized):
             filled_rewards[idx] = reward
+            filled_rewards_normalized[idx] = normalized_reward

         # Return the filled rewards.
-        return filled_rewards
+        return filled_rewards, filled_rewards_normalized


class MockRewardModel( BaseRewardModel ):
@@ -121,7 +123,7 @@ def __init__(self, mock_name: str = 'MockReward'):
        self.mock_name = mock_name

    def apply( self, prompt: str, completion: List[str], name: str ) -> torch.FloatTensor:
-        return torch.tensor( [0 for _ in completion], dtype=torch.float32 )
-
+        mock_reward = torch.tensor( [0 for _ in completion], dtype=torch.float32 )
+        return mock_reward, mock_reward
\ No newline at end of file
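
Review note: the sketch below illustrates the new two-tensor contract introduced by this diff, where a reward function's apply() returns both raw and normalized scores and the caller accumulates the normalized ones while logging both. It is a minimal, self-contained example, not code from the repository: DummyRewardModel, its length-based get_rewards scoring, and the 0.5 weight are illustrative assumptions; only the tuple return and the softmax normalization (per the "Softmax rewards across samples" comment) reflect the diff itself.

import torch
from typing import List, Tuple

class DummyRewardModel:
    """Illustrative stand-in for a reward model; not part of openvalidators."""

    name = "dummy_reward_model"

    def normalize_rewards(self, rewards: torch.FloatTensor) -> torch.FloatTensor:
        # Assumed normalization: softmax across samples, as the in-diff comment suggests.
        return torch.softmax(rewards, dim=0)

    def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
        # Toy scoring for illustration only: longer completions score higher.
        return torch.tensor([float(len(c)) for c in completions], dtype=torch.float32)

    def apply(self, prompt: str, completions: List[str], name: str) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        raw = self.get_rewards(prompt, completions, name)
        normalized = self.normalize_rewards(raw)
        # New contract: return both tensors so callers can log raw scores
        # while weighting and masking with the normalized ones.
        return raw, normalized

if __name__ == "__main__":
    model = DummyRewardModel()
    completions = ["short", "a somewhat longer completion", "medium length"]

    # apply() now returns a tuple, so unpack first and move each tensor to the
    # device individually; the tuple itself has no .to() method.
    raw, normalized = model.apply("example prompt", completions, "followup")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    weight = 0.5  # illustrative reward weight
    rewards = torch.zeros(len(completions), dtype=torch.float32, device=device)
    rewards += weight * normalized.to(device)

    # Mirrors the event logging in forward.py: raw and normalized under separate keys.
    event = {model.name: raw.tolist(), model.name + "_normalized": normalized.tolist()}
    print(event)

The point of the design is that the validator weights and masks with the normalized scores, while the raw model outputs remain available in the event log under a separate "_normalized"-suffixed key pair for debugging and analysis.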