added original reward to wandb
isabella618033 committed Aug 24, 2023
1 parent 8267b2e commit 23d62ea
Showing 2 changed files with 15 additions and 11 deletions.
14 changes: 8 additions & 6 deletions openvalidators/forward.py
@@ -87,18 +87,20 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude
     # Compute the rewards for the responses given the prompt.
     rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device)
     for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions):
-        reward_i = reward_fn_i.apply(prompt, responses, name).to(self.device)
-        rewards += weight_i * reward_i
+        reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name).to(self.device)
+        rewards += weight_i * reward_i_normalized
         if not self.config.neuron.disable_log_rewards:
             event[reward_fn_i.name] = reward_i.tolist()
-            bt.logging.trace(str(reward_fn_i.name), reward_i.tolist())
+            event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist()
+            bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist())

     for masking_fn_i in self.masking_functions:
-        mask_i = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
-        rewards *= mask_i  # includes diversity
+        mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
+        rewards *= mask_i_normalized  # includes diversity
         if not self.config.neuron.disable_log_rewards:
             event[masking_fn_i.name] = mask_i.tolist()
-            bt.logging.trace(str(masking_fn_i.name), mask_i.tolist())
+            event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist()
+            bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist())

     # Train the gating model based on the predicted scores and the actual rewards.
     gating_scores: torch.FloatTensor = self.gating_model(prompt).to(self.device)
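Note on the reward loop above: apply() now returns a pair of tensors, so chaining .to(self.device) onto the returned tuple will not work as written; each tensor has to be moved to the device individually before use. A minimal, self-contained sketch of that pattern, and of sending the populated event dict (raw and '_normalized' keys side by side) to wandb; the reward function and key names below are stand-ins, not the repository's code:

import torch

def apply_reward(prompt, responses):
    # Stand-in for reward_fn_i.apply(): returns (raw, normalized) reward tensors.
    raw = torch.rand(len(responses))
    return raw, raw.softmax(dim=0)

device = "cuda" if torch.cuda.is_available() else "cpu"
responses = ["completion a", "completion b", "completion c"]
event = {}

raw, normalized = apply_reward("example prompt", responses)  # unpack first
raw, normalized = raw.to(device), normalized.to(device)      # then move each tensor
event["reward"] = raw.tolist()
event["reward_normalized"] = normalized.tolist()
# wandb.log(event)  # assuming a wandb run was initialised with wandb.init() elsewhere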
12 changes: 7 additions & 5 deletions openvalidators/reward/reward.py
@@ -98,17 +98,19 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) ->
         successful_rewards = self.get_rewards( prompt, successful_completions, name )

         # Softmax rewards across samples.
-        successful_rewards = self.normalize_rewards( successful_rewards )
+        successful_rewards_normalized = self.normalize_rewards( successful_rewards )

         # Init zero rewards for all calls.
         filled_rewards = torch.zeros( len( responses ), dtype=torch.float32)
+        filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32)

         # Fill reward tensor.
-        for idx, reward in zip(successful_completions_indices, successful_rewards):
+        for idx, reward, normalized_reward in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized):
             filled_rewards[idx] = reward
+            filled_rewards_normalized[idx] = normalized_reward

         # Return the filled rewards.
-        return filled_rewards
+        return filled_rewards, filled_rewards_normalized


 class MockRewardModel( BaseRewardModel ):
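The normalize_rewards implementation is not part of this diff; going by the "Softmax rewards across samples." comment above it, a plausible minimal sketch (an assumption, not the repository's actual code) is a softmax over the batch of successful rewards:

import torch

def normalize_rewards(rewards: torch.FloatTensor) -> torch.FloatTensor:
    # Softmax across samples: output has the same shape and its entries sum to 1.
    return torch.softmax(rewards, dim=0)

raw = torch.tensor([0.2, 1.5, -0.3])
print(normalize_rewards(raw))  # same shape as the input, values sum to 1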
@@ -121,7 +123,7 @@ def __init__(self, mock_name: str = 'MockReward'):
         self.mock_name = mock_name

     def apply( self, prompt: str, completion: List[str], name: str ) -> torch.FloatTensor:
-        return torch.tensor( [0 for _ in completion], dtype=torch.float32 )
-
+        mock_reward = torch.tensor( [0 for _ in completion], dtype=torch.float32 )
+        return mock_reward, mock_reward


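With both the real and the mock reward models now returning a (raw, normalized) pair, every caller of apply() unpacks two tensors; the mock simply hands back the same zero tensor twice. A hypothetical usage sketch, assuming MockRewardModel is importable from openvalidators.reward.reward and that any string is accepted for the name argument:

import torch
from openvalidators.reward.reward import MockRewardModel

mock = MockRewardModel()
raw, normalized = mock.apply("example prompt", ["completion a", "completion b"], "augment")
assert torch.equal(raw, normalized)  # the mock returns the identical zero tensor for both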