First attempt at fitting one sided directions
ojh31 committed Oct 13, 2023
1 parent 6d9ed24 commit 6663ec0
Showing 1 changed file with 113 additions and 0 deletions: fit_one_sided_directions.py
#%%
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformer_lens import HookedTransformer
from transformer_lens.utils import tokenize_and_concatenate, get_act_name
from tqdm.notebook import tqdm
import yaml
from utils.store import save_array
#%%
torch.set_grad_enabled(False)
device = "cuda"
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    device=device,
)
#%%
BATCH_SIZE = 64
owt_data = load_dataset("stas/openwebtext-10k", split="train")
dataset = tokenize_and_concatenate(owt_data, model.tokenizer)
data_loader = DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True
)
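#%%
# Quick look at one batch (illustrative, not part of the fitting itself):
# tokenize_and_concatenate packs the text into fixed-length rows under a
# 'tokens' column, so every batch has shape [BATCH_SIZE, seq_len].
example_batch = next(iter(data_loader))
print(example_batch['tokens'].shape)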
#%%
# Hook point for the residual stream immediately after block 0.
ACT_NAME = get_act_name("resid_post", 0)
#%% # Neutral
# Neutral baseline: running mean of the layer-0 residual stream at
# position 1 (immediately after BOS, the same position as in the
# single-token prompts below) over OpenWebText batches. drop_last=True
# keeps batch sizes equal, so averaging per-batch means gives the
# overall mean.
count = 0
total = torch.zeros(model.cfg.d_model)
for batch in tqdm(data_loader):
    _, cache = model.run_with_cache(
        batch['tokens'],
        return_type=None,
        names_filter=lambda name: name == ACT_NAME,
    )
    count += 1
    total += cache[ACT_NAME][:, 1, :].mean(dim=0).cpu()
neutral_activation = total / count
print(neutral_activation.shape, neutral_activation.norm())
#%% # Positive
with open("prompts.yaml", "r") as f:
    prompt_dict = yaml.safe_load(f)
#%%
positive_str_tokens = (
    prompt_dict['positive_adjectives_train'] +
    prompt_dict['positive_comment_adjectives'] +
    prompt_dict['positive_nouns'] +
    prompt_dict['positive_verbs'] +
    prompt_dict['positive_infinitives']
)
# Keep only words that are a single token with a leading space (GPT-2's
# BPE treats the word-initial space as part of the token), so that every
# prompt is exactly [BOS, token].
positive_single_tokens = []
for token in positive_str_tokens:
    token = " " + token
    if len(model.to_str_tokens(token, prepend_bos=False)) == 1:
        positive_single_tokens.append(token)
positive_tokens = model.to_tokens(
    positive_single_tokens,
    prepend_bos=True,
)
assert positive_tokens.shape[1] == 2
_, positive_cache = model.run_with_cache(
    positive_tokens,
    return_type=None,
    names_filter=lambda name: name == ACT_NAME,
)
positive_activation = positive_cache[ACT_NAME][:, -1].mean(dim=0).cpu()
print(positive_activation.shape, positive_activation.norm())
#%% # Negative
negative_str_tokens = (
    prompt_dict['negative_adjectives_train'] +
    prompt_dict['negative_comment_adjectives'] +
    prompt_dict['negative_nouns'] +
    prompt_dict['negative_verbs'] +
    prompt_dict['negative_infinitives']
)
negative_single_tokens = []
for token in negative_str_tokens:
    token = " " + token
    if len(model.to_str_tokens(token, prepend_bos=False)) == 1:
        negative_single_tokens.append(token)
negative_tokens = model.to_tokens(
    negative_single_tokens,
    prepend_bos=True,
)
assert negative_tokens.shape[1] == 2
_, negative_cache = model.run_with_cache(
    negative_tokens,
    return_type=None,
    names_filter=lambda name: name == ACT_NAME,
)
negative_activation = negative_cache[ACT_NAME][:, -1].mean(dim=0).cpu()
print(negative_activation.shape, negative_activation.norm())
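#%%
# Sample sizes after the single-token filter (a rough sanity check; very
# small samples would make the mean-difference directions noisy).
print(
    len(positive_single_tokens), "positive and",
    len(negative_single_tokens), "negative single-token prompts",
)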
# %%
# One-sided directions: mean sentiment activation minus the neutral
# baseline, normalised to unit length.
positive_direction = positive_activation - neutral_activation
negative_direction = negative_activation - neutral_activation
positive_direction = positive_direction / positive_direction.norm()
negative_direction = negative_direction / negative_direction.norm()
#%%
save_array(
positive_direction.cpu().numpy(), "mean_diff_positive_layer01", model
)
save_array(
negative_direction.cpu().numpy(), "mean_diff_negative_layer01", model
)
#%% # Cosine similarity between the two directions: checks whether the
# positive and negative directions are distinct or just sign flips of
# each other.
print(torch.cosine_similarity(positive_direction, negative_direction, dim=0))
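#%%
# Rough sanity check on two arbitrary held-out words (chosen here for
# illustration, not taken from prompts.yaml): after subtracting the
# neutral baseline, a positive token's residual activation should project
# more strongly onto positive_direction, and a negative token's onto
# negative_direction.
for text in [" amazing", " terrible"]:
    check_tokens = model.to_tokens(text, prepend_bos=True)
    _, check_cache = model.run_with_cache(
        check_tokens,
        return_type=None,
        names_filter=lambda name: name == ACT_NAME,
    )
    check_act = check_cache[ACT_NAME][0, -1].cpu() - neutral_activation
    print(
        repr(text),
        "pos proj:", (check_act @ positive_direction).item(),
        "neg proj:", (check_act @ negative_direction).item(),
    )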

# %%
