Skip to content

Commit

Permalink
Tried handmade neutral tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
ojh31 committed Oct 13, 2023
1 parent 6663ec0 commit 2dbfedc
Showing 1 changed file with 41 additions and 21 deletions.
62 changes: 41 additions & 21 deletions fit_one_sided_directions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,50 @@
"gpt2-small",
)
#%%
BATCH_SIZE = 64
owt_data = load_dataset("stas/openwebtext-10k", split="train")
dataset = tokenize_and_concatenate(owt_data, model.tokenizer)
data_loader = DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True
)
#%%
ACT_NAME = get_act_name("resid_post", 0)
#%% # Neutral
# Estimate a "neutral" residual-stream direction: the mean layer-0 resid_post
# activation at position 1 (the first token after BOS), averaged over OWT.
# drop_last=True keeps every batch the same size, so the mean of per-batch
# means equals the overall mean.
count = 0
total = torch.zeros(model.cfg.d_model)
for batch in tqdm(data_loader):
    # no_grad: we only read activations, so skip building the autograd
    # graph — otherwise every cached activation retains graph memory.
    with torch.no_grad():
        _, cache = model.run_with_cache(
            batch['tokens'],
            return_type=None,
            names_filter=lambda name: name == ACT_NAME,
        )
    count += 1
    total += cache[ACT_NAME][:, 1, :].mean(dim=0).cpu()
neutral_activation = total / count
print(neutral_activation.shape, neutral_activation.norm())
#%% # Positive
# #%%
# BATCH_SIZE = 64
# owt_data = load_dataset("stas/openwebtext-10k", split="train")
# dataset = tokenize_and_concatenate(owt_data, model.tokenizer)
# data_loader = DataLoader(
# dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True
# )
# #%% # Neutral
# count = 0
# total = torch.zeros(model.cfg.d_model)
# for batch in tqdm(data_loader):
# _, cache = model.run_with_cache(
# batch['tokens'],
# return_type=None,
# names_filter = lambda name: name == ACT_NAME
# )
# count += 1
# total += cache[ACT_NAME][:, 1, :].mean(dim=0).cpu()
# neutral_activation = total / count
# print(neutral_activation.shape, neutral_activation.norm())
#%% Handmade prompts
with open("prompts.yaml", "r") as f:
    prompt_dict = yaml.safe_load(f)
#%% Handmade neutral
# Build "<bos> <adjective>" prompts from hand-picked neutral adjectives,
# keeping only those that tokenize (with a leading space) to a single token,
# so every prompt is exactly [BOS, adjective] and shape[1] == 2.
neutral_str_tokens = prompt_dict['neutral_adjectives']
neutral_single_tokens = [
    " " + token
    for token in neutral_str_tokens
    if len(model.to_str_tokens(" " + token, prepend_bos=False)) == 1
]
neutral_tokens = model.to_tokens(
    neutral_single_tokens,
    prepend_bos=True,
)
assert neutral_tokens.shape[1] == 2
# no_grad: activations only — avoid retaining the autograd graph.
with torch.no_grad():
    _, neutral_cache = model.run_with_cache(
        neutral_tokens,
        return_type=None,
        names_filter=lambda name: name == ACT_NAME,
    )
# Mean layer-0 resid_post at the adjective position, averaged over prompts.
neutral_activation = neutral_cache[ACT_NAME][:, -1].mean(dim=0).cpu()
print(neutral_activation.shape, neutral_activation.norm())
#%% # Positive
#%%
positive_str_tokens = (
prompt_dict['positive_adjectives_train'] +
Expand Down

0 comments on commit 2dbfedc

Please sign in to comment.