fine_tune_model.py
import os

import datasets
from datasets import load_dataset
from transformers import LlamaTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, \
    DataCollatorForTokenClassification


# Helper functions
def get_training_args():
    candidate_labels = ["attack", "normal"]
    return TrainingArguments(
        output_dir="test_trainer",
        logging_strategy="epoch",
        label_names=candidate_labels,
        load_best_model_at_end=True,
        evaluation_strategy="steps",
        eval_steps=50,
        warmup_steps=100,
        save_steps=50,
        num_train_epochs=10,
        do_train=True,
        do_eval=True,
    )
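
# Optional (a sketch, not part of the original script): a compute_metrics function that
# could be passed to the Trainer below via `compute_metrics=compute_metrics` to report
# accuracy alongside the evaluation loss. The Trainer hands it an EvalPrediction whose
# `predictions` are the raw logits and whose `label_ids` are the gold labels, both as
# numpy arrays.
def compute_metrics(eval_pred):
    # Pick the highest-scoring class per example and compare against the gold labels.
    predictions = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predictions == eval_pred.label_ids).mean())}
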
def get_data_set(from_percent, to_percent, seed=42):
    # Load a percentage slice of the dataset, tokenize it, and shuffle it.
    model_features = datasets.Features(
        {'text': datasets.Value('string'), 'label': datasets.ClassLabel(names=candidate_labels)})
    return load_dataset("niting3c/malicious-packet-analysis",
                        features=model_features,
                        split=datasets.ReadInstruction("train",
                                                       from_=from_percent,
                                                       to=to_percent,
                                                       unit="%",
                                                       rounding="pct1_dropremainder"),
                        streaming=True,
                        ).map(tokenize_function, batched=True).shuffle(seed=seed)


def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=3600)
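
# Note on the helpers above: with streaming=True, load_dataset returns an IterableDataset,
# so the tokenization applied via .map(...) runs lazily as the Trainer iterates over the
# stream rather than being materialised up front.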
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
tokenizer = LlamaTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
# LLaMA tokenizers ship without a padding token, so register one before batching.
tokenizer.add_special_tokens({'pad_token': "-100"})
# The tokenizer grew by one token, so resize the embeddings and record the pad id.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Define candidate labels and model config settings
candidate_labels = ["attack", "normal"]
model.config.architectures = ["LlamaForSequenceClassification"]
model.config.zero_shot_classification = True

# Prepare datasets (this must happen after the tokenizer and candidate_labels are
# defined, since get_data_set references both)
normal_dataset_train = get_data_set(0, 90)
normal_dataset_validate = get_data_set(90, 100)

# Prepare training arguments and data collator
training_args = get_training_args()
data_collator = DataCollatorForTokenClassification(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)
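# pad_to_multiple_of=8 rounds each padded batch length up to a multiple of 8, which
# tends to improve tensor-core throughput on mixed-precision GPUs without affecting
# the model outputs.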
# Train and evaluate on the normal dataset
try:
    trainer = Trainer(
        model=model,
        train_dataset=normal_dataset_train,
        eval_dataset=normal_dataset_validate,
        args=training_args,
        data_collator=data_collator,
    )
    trainer.train()
    trainer.save_model()  # Save the model after training.

    # Evaluate and write the results to an output folder for later use.
    eval_results = trainer.evaluate()
    os.makedirs("output", exist_ok=True)
    with open('output/eval_results.txt', 'w') as f:
        print(eval_results, file=f)
    print("Evaluation results on normal dataset:", eval_results)
except Exception as e:
    print("An error occurred during training on normal dataset:")
    print(str(e))
    exit(1)
# Save the model and tokenizer locally.
model.save_pretrained("test_trainer")
tokenizer.save_pretrained("test_trainer")

# Push to the Hugging Face Hub.
try:
    model.push_to_hub("niting3c/llama-2-7b-hf-zero-shot", use_auth_token=True)
    tokenizer.push_to_hub("niting3c/llama-2-7b-hf-zero-shot", use_auth_token=True)
    print("Model and tokenizer pushed to the Hugging Face Hub.")
except Exception as e:
    print("An error occurred while pushing to the Hugging Face Hub:")
    print(str(e))
    exit(1)
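
# Example follow-up (a sketch, not executed by this script): the pushed checkpoint
# could later be reloaded from the Hub for inference, assuming the repository above
# is accessible with your credentials:
#
#   model = AutoModelForSequenceClassification.from_pretrained("niting3c/llama-2-7b-hf-zero-shot")
#   tokenizer = LlamaTokenizer.from_pretrained("niting3c/llama-2-7b-hf-zero-shot")
#   inputs = tokenizer("example packet payload", return_tensors="pt", truncation=True)
#   predicted_class_id = model(**inputs).logits.argmax(dim=-1).item()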