
The issue of saving a model that has been modified with DuoAttention to local storage #11

Open
xlxcomputer opened this issue Nov 13, 2024 · 0 comments

Comments

@xlxcomputer

Hello, thank you very much for your work on DuoAttention; it has given me a lot of inspiration.
I attempted to save a model that uses DuoAttention locally and then reload it. However, I encountered the following warnings:
=============================warning information========================================
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 14.94it/s]
Some weights of the model checkpoint at duoattention/Llama-3-8B-Instruct-Gradient-1048k were not used when initializing LlamaForCausalLM: ['model.layers.0.self_attn.full_attention_heads', 'model.layers.1.self_attn.full_attention_heads', 'model.layers.10.self_attn.full_attention_heads', 'model.layers.11.self_attn.full_attention_heads', 'model.layers.12.self_attn.full_attention_heads', 'model.layers.13.self_attn.full_attention_heads', 'model.layers.14.self_attn.full_attention_heads', 'model.layers.15.self_attn.full_attention_heads', 'model.layers.16.self_attn.full_attention_heads', 'model.layers.17.self_attn.full_attention_heads', 'model.layers.18.self_attn.full_attention_heads', 'model.layers.19.self_attn.full_attention_heads', 'model.layers.2.self_attn.full_attention_heads', 'model.layers.20.self_attn.full_attention_heads', 'model.layers.21.self_attn.full_attention_heads', 'model.layers.22.self_attn.full_attention_heads', 'model.layers.23.self_attn.full_attention_heads', 'model.layers.24.self_attn.full_attention_heads', 'model.layers.25.self_attn.full_attention_heads', 'model.layers.26.self_attn.full_attention_heads', 'model.layers.27.self_attn.full_attention_heads', 'model.layers.28.self_attn.full_attention_heads', 'model.layers.29.self_attn.full_attention_heads', 'model.layers.3.self_attn.full_attention_heads', 'model.layers.30.self_attn.full_attention_heads', 'model.layers.31.self_attn.full_attention_heads', 'model.layers.4.self_attn.full_attention_heads', 'model.layers.5.self_attn.full_attention_heads', 'model.layers.6.self_attn.full_attention_heads', 'model.layers.7.self_attn.full_attention_heads', 'model.layers.8.self_attn.full_attention_heads', 'model.layers.9.self_attn.full_attention_heads']

- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

DuoAttention enabled for this model.
Input Text: Hello, do you like chinese food?
Starting from v4.46, the logits model output will have the same type as the model (except at train time, where it will always be FP32)
Generated Text: Hello, do you like chinese food? I do! I love it. I love the taste of the food, the smell of the food, the way it makes me feel. I love it all. I love the way it makes me feel. I love the way it makes me feel
======================================================================================

However, I am still able to get normal outputs, so I would like to ask whether such warnings are expected when using the locally cached model with DuoAttention applied. Below is the code I used to save the DuoAttention model and to load the locally stored model.

===================================store_model.py======================================
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from duo_attn.utils import load_attn_pattern, sparsify_attention_heads
from duo_attn.patch import enable_duo_attention_eval

model_path = "/data1/HF-Models/gradientai/Llama-3-8B-Instruct-Gradient-1048k"
output_path = "/data1/HF-Models/duoattention/Llama-3-8B-Instruct-Gradient-1048k"


def apply_and_save_duoattention(model_path, output_path, attn_pattern_file):
    print("Loading model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    print("Loading DuoAttention pattern...")
    attn_heads, sink_size, recent_size = load_attn_pattern(attn_pattern_file)
    attn_heads, sparsity = sparsify_attention_heads(attn_heads, sparsity=0.5)

    print("Applying DuoAttention to the model...")
    enable_duo_attention_eval(model, attn_heads, sink_size=64, recent_size=256)
    print("DuoAttention has been applied to the model.")

    # Save the patched model and tokenizer with the standard HF APIs
    os.makedirs(output_path, exist_ok=True)
    print(f"Saving model and tokenizer to {output_path}...")
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    # Also save the attention-head pattern alongside the checkpoint
    attention_pattern_path = os.path.join(output_path, "attention_pattern")
    os.makedirs(attention_pattern_path, exist_ok=True)
    torch.save(attn_heads, os.path.join(attention_pattern_path, "pattern_file.pt"))

    print("Model, tokenizer, and DuoAttention settings have been saved successfully.")


attn_pattern_file = "/data1/xuluxin/duo-attention/attn_patterns/Llama-3-8B-Instruct-Gradient-1048k/lr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10"
apply_and_save_duoattention(model_path, output_path, attn_pattern_file)
=====================================================================================
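
As a quick sanity check between the two scripts, I assume something like the following would show whether the full_attention_heads buffers survive the save/load round trip (it simply reloads output_path and counts them):

import torch
from transformers import AutoModelForCausalLM

output_path = "/data1/HF-Models/duoattention/Llama-3-8B-Instruct-Gradient-1048k"

# Reload the saved checkpoint and count how many DuoAttention buffers are present
reloaded = AutoModelForCausalLM.from_pretrained(output_path, torch_dtype=torch.bfloat16)
duo_keys = [k for k in reloaded.state_dict() if "full_attention_heads" in k]
print(f"full_attention_heads buffers after reload: {len(duo_keys)}")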
========================================test_example====================================
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path of the locally saved DuoAttention model (the output_path from store_model.py)
model_path = "/data1/HF-Models/duoattention/Llama-3-8B-Instruct-Gradient-1048k"


def load_duoattention_model(model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print("DuoAttention model loaded successfully.")
    return model, tokenizer


def generate_text(model, tokenizer, input_text, max_length=50):
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding, one token at a time
    output_ids = inputs["input_ids"]
    for _ in range(max_length):
        with torch.no_grad():
            outputs = model(input_ids=output_ids)
        next_token_logits = outputs.logits[:, -1, :]
        next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1).to("cuda")
        output_ids = torch.cat([output_ids, next_token_id], dim=-1)
        if next_token_id.item() == tokenizer.eos_token_id:
            break
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("Generated Text:", output_text)
    return output_text


if __name__ == "__main__":
    model, tokenizer = load_duoattention_model(model_path)
    input_text = "Hello, do you like chinese food?"
    print("Input Text:", input_text)
    generate_text(model, tokenizer, input_text)
=====================================================================================
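
In case the full_attention_heads buffers really are dropped when the checkpoint is reloaded and the pattern has to be re-applied by hand, I assume the loading side would look roughly like this sketch, reusing enable_duo_attention_eval and the pattern_file.pt saved by store_model.py (with the same sink_size/recent_size I used when saving):

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from duo_attn.patch import enable_duo_attention_eval

output_path = "/data1/HF-Models/duoattention/Llama-3-8B-Instruct-Gradient-1048k"

# Reload the locally saved checkpoint (this is the step that prints the warning)
model = AutoModelForCausalLM.from_pretrained(output_path, torch_dtype=torch.bfloat16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(output_path)

# Load the attention-head pattern that store_model.py saved next to the checkpoint
attn_heads = torch.load(os.path.join(output_path, "attention_pattern", "pattern_file.pt"))

# Re-apply DuoAttention with the same settings used when saving
enable_duo_attention_eval(model, attn_heads, sink_size=64, recent_size=256)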

I would be very grateful if you could provide me with relevant advice.
