added original reward to wandb
isabella618033 committed Aug 24, 2023
1 parent 8267b2e commit 23d62ea
Showing 2 changed files with 15 additions and 11 deletions.
14 changes: 8 additions & 6 deletions openvalidators/forward.py
@@ -87,18 +87,20 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude
     # Compute the rewards for the responses given the prompt.
     rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device)
     for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions):
-        reward_i = reward_fn_i.apply(prompt, responses, name).to(self.device)
-        rewards += weight_i * reward_i
+        reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name).to(self.device)
+        rewards += weight_i * reward_i_normalized
         if not self.config.neuron.disable_log_rewards:
             event[reward_fn_i.name] = reward_i.tolist()
-            bt.logging.trace(str(reward_fn_i.name), reward_i.tolist())
+            event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist()
+            bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist())

     for masking_fn_i in self.masking_functions:
-        mask_i = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
-        rewards *= mask_i  # includes diversity
+        mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
+        rewards *= mask_i_normalized  # includes diversity
         if not self.config.neuron.disable_log_rewards:
             event[masking_fn_i.name] = mask_i.tolist()
-            bt.logging.trace(str(masking_fn_i.name), mask_i.tolist())
+            event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist()
+            bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist())

     # Train the gating model based on the predicted scores and the actual rewards.
     gating_scores: torch.FloatTensor = self.gating_model(prompt).to(self.device)
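Note on the reward loop above: apply() now returns a pair of tensors, so chaining .to(self.device) onto the returned tuple will not work as written; each tensor has to be moved to the device individually before use. A minimal, self-contained sketch of that pattern, and of sending the populated event dict (raw and '_normalized' keys side by side) to wandb; the reward function and key names below are stand-ins, not the repository's code:

import torch

def apply_reward(prompt, responses):
    # Stand-in for reward_fn_i.apply(): returns (raw, normalized) reward tensors.
    raw = torch.rand(len(responses))
    return raw, raw.softmax(dim=0)

device = "cuda" if torch.cuda.is_available() else "cpu"
responses = ["completion a", "completion b", "completion c"]
event = {}

raw, normalized = apply_reward("example prompt", responses)  # unpack first
raw, normalized = raw.to(device), normalized.to(device)      # then move each tensor
event["reward"] = raw.tolist()
event["reward_normalized"] = normalized.tolist()
# wandb.log(event)  # assuming a wandb run was initialised with wandb.init() elsewhere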
12 changes: 7 additions & 5 deletions openvalidators/reward/reward.py
@@ -98,17 +98,19 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) ->
         successful_rewards = self.get_rewards( prompt, successful_completions, name )

         # Softmax rewards across samples.
-        successful_rewards = self.normalize_rewards( successful_rewards )
+        successful_rewards_normalized = self.normalize_rewards( successful_rewards )

         # Init zero rewards for all calls.
         filled_rewards = torch.zeros( len( responses ), dtype=torch.float32)
+        filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32)

         # Fill reward tensor.
-        for idx, reward in zip(successful_completions_indices, successful_rewards):
+        for idx, reward, normalized_reward in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized):
             filled_rewards[idx] = reward
+            filled_rewards_normalized[idx] = normalized_reward

         # Return the filled rewards.
-        return filled_rewards
+        return filled_rewards, filled_rewards_normalized


 class MockRewardModel( BaseRewardModel ):
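The normalize_rewards implementation is not part of this diff; going by the "Softmax rewards across samples." comment above it, a plausible minimal sketch (an assumption, not the repository's actual code) is a softmax over the batch of successful rewards:

import torch

def normalize_rewards(rewards: torch.FloatTensor) -> torch.FloatTensor:
    # Softmax across samples: output has the same shape and its entries sum to 1.
    return torch.softmax(rewards, dim=0)

raw = torch.tensor([0.2, 1.5, -0.3])
print(normalize_rewards(raw))  # same shape as the input, values sum to 1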
@@ -121,7 +123,7 @@ def __init__(self, mock_name: str = 'MockReward'):
         self.mock_name = mock_name

     def apply( self, prompt: str, completion: List[str], name: str ) -> torch.FloatTensor:
-        return torch.tensor( [0 for _ in completion], dtype=torch.float32 )
-
+        mock_reward = torch.tensor( [0 for _ in completion], dtype=torch.float32 )
+        return mock_reward, mock_reward


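With both the real and the mock reward models now returning a (raw, normalized) pair, every caller of apply() unpacks two tensors; the mock simply hands back the same zero tensor twice. A hypothetical usage sketch, assuming MockRewardModel is importable from openvalidators.reward.reward and that any string is accepted for the name argument:

import torch
from openvalidators.reward.reward import MockRewardModel

mock = MockRewardModel()
raw, normalized = mock.apply("example prompt", ["completion a", "completion b"], "augment")
assert torch.equal(raw, normalized)  # the mock returns the identical zero tensor for both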