adjust training procedure
whoisjones committed Nov 2, 2023
1 parent 023b3af commit f2e2cc6
Showing 4 changed files with 51 additions and 55 deletions.
20 changes: 11 additions & 9 deletions first_experiment/main.py
@@ -17,23 +17,25 @@
parser = ArgumentParser()
parser.add_argument("--dataset", type=str, default="imdb")
parser.add_argument("--tam_model", type=str, default="distilbert-base-uncased")
parser.add_argument("--embedding_model", type=str, default="distilbert-base-uncased")
parser.add_argument("--embedding_model", type=str, default=None)
parser.add_argument("--init_strategy", type=str, choices=["random", "closest-to-centeroid", "furthest-to-centeroid", "expected-gradients", "certainty"], default="random")
parser.add_argument("--stopping_criteria", type=str)
parser.add_argument("--dataset_size", type=int, default=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192])
parser.add_argument("--dataset_size", type=int, nargs="+", default=[32, 64, 128, 256, 512, 1024, 2048, 4096, 0])
args = parser.parse_args()

full_dataset = load_dataset(args.dataset)
#TODO stopping_critera = load_stopping_criteria(args.stopping_criteria)
task_keys = task_to_keys[args.dataset]

for dataset_size in args.dataset_size:
-    dataset = select_fewshots(
-        args,
-        full_dataset,
-        dataset_size,
-        task_keys
-    )
+    if dataset_size > 0:
+        dataset = select_fewshots(
+            args,
+            full_dataset,
+            dataset_size,
+            task_keys
+        )
+    else:
+        dataset = full_dataset

train_classification(
args,
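
Note on the change above: with nargs="+" the script can sweep several few-shot sizes in one run, and the new sentinel 0 falls through to the full training set. A minimal sketch of the behaviour (the parse_args call and printed values are illustrative, not part of the commit):

    # Illustration of the reworked --dataset_size semantics; values are made up.
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--dataset_size", type=int, nargs="+",
                        default=[32, 64, 128, 256, 512, 1024, 2048, 4096, 0])
    args = parser.parse_args(["--dataset_size", "32", "128", "0"])

    for dataset_size in args.dataset_size:
        if dataset_size > 0:
            print(f"select a few-shot subset with {dataset_size} examples")
        else:
            print("train on the full dataset")
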
4 changes: 4 additions & 0 deletions first_experiment/selection_strategies.py
@@ -25,13 +25,17 @@ def select_fewshots(
task_keys["label_column"]
)
elif args.init_strategy == "class-centeroid-closest":
+        if args.embedding_model is None:
+            raise ValueError("You need to specify an embedding model for this init strategy.")
dataset = closest_to_centeroid_selection(
args.embedding_model,
full_dataset,
dataset_size,
task_keys
)
elif args.init_strategy == "class-centeroid-furthest":
+        if args.embedding_model is None:
+            raise ValueError("You need to specify an embedding model for this init strategy.")
dataset = furthest_to_centeroid_selection(
args.embedding_model,
full_dataset,
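
Note on the two guards added above: the centroid-based init strategies embed the training data first, so they cannot run without --embedding_model. For orientation, a sketch of the closest-to-centroid idea (an illustrative re-implementation, not the repository's selection code):

    # Sketch: per class, pick the k examples whose embeddings lie closest to the
    # class centroid. Assumes embeddings were computed beforehand, e.g. with the
    # model named by --embedding_model.
    import numpy as np

    def closest_to_centroid(embeddings: np.ndarray, labels: np.ndarray, k_per_class: int):
        selected = []
        for label in np.unique(labels):
            idx = np.where(labels == label)[0]
            centroid = embeddings[idx].mean(axis=0)
            dists = np.linalg.norm(embeddings[idx] - centroid, axis=1)
            selected.extend(idx[np.argsort(dists)[:k_per_class]].tolist())
        return selected

The furthest-to-centroid variant is the same loop with the sort order reversed.
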
52 changes: 23 additions & 29 deletions first_experiment/tam_training.py
@@ -1,54 +1,47 @@
import json
from pathlib import Path
+from argparse import Namespace

import numpy as np

-from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
+from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

from utils import *

PATH = Path("/glusterfs/dfs-gfs-dist/goldejon/initial-starting-point-generation")

def train_classification(
-    args,
-    dataset,
-    text_column,
-    label_column,
+    args: Namespace,
+    dataset: DatasetDict,
+    task_keys: dict
):
+    label_column = task_keys["label_column"]
id2label = dict(enumerate(dataset["train"].features[label_column].names))
label2id = {v: k for k, v in id2label.items()}

-    model = AutoModelForSequenceClassification.from_pretrained(
-        args.tam_model,
-        num_labels=len(id2label),
-        id2label=id2label,
-        label2id=label2id
-    )
-    tokenizer = AutoTokenizer.from_pretrained(args.tam_model)
+    model, tokenizer = get_classification_model_and_tokenizer(args.tam_model, id2label=id2label)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

-    if len(text_column) == 1:
-        def preprocess_function(examples):
-            return tokenizer(examples[text_column[0]], truncation=True)
-    else:
-        def preprocess_function(examples):
-            return tokenizer(examples[text_column[0]], examples[text_column[1]], truncation=True)

-    tokenized_dataset = dataset.map(preprocess_function, batched=True)
+    tokenized_dataset = dataset.map(
+        preprocess_function,
+        batched=True,
+        fn_kwargs={"tokenizer": tokenizer, "task_keys": task_keys}
+    )

-    if args.init_strategy in ["class-centeroid", "class-furthest"]:
-        embedding_model = args.embedding_model.split("/")[-1]
-        experiment_extension = f"{args.tam_model}_{args.dataset}_{args.dataset_size}_{args.init_strategy}_{embedding_model}"
-    else:
-        experiment_extension = f"{args.tam_model}_{args.dataset}_{args.dataset_size}_{args.init_strategy}"
+    experiment_extension = (f"{args.tam_model}"
+                            f"_{args.dataset}"
+                            f"_{args.dataset_size}"
+                            f"_{args.init_strategy}"
+                            f"_{args.embedding_model if args.embedding_model is not None else ''}")

log_path = PATH / experiment_extension

+    batch_size = 16
+    total_steps = min(len(dataset["train"]) // batch_size * 3, 200)
training_args = TrainingArguments(
output_dir=str(log_path),
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
-        num_train_epochs=20,
+        num_train_epochs=total_steps * batch_size // len(dataset["train"]),
warmup_ratio=0.1,
weight_decay=0.01,
logging_steps=5,
@@ -67,6 +60,7 @@ def compute_metrics(eval_pred):
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
+        eval_dataset=tokenized_dataset["validation"] if "validation" in tokenized_dataset else None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
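
Note on the training-length change above: the fixed num_train_epochs=20 is replaced by an epoch count derived from a capped step budget, roughly three epochs but at most 200 optimizer steps. A worked example with hypothetical training-set sizes:

    # Worked example of the new step/epoch budget; the dataset sizes are made up.
    batch_size = 16
    for n_train in (128, 1024, 8192):
        total_steps = min(n_train // batch_size * 3, 200)
        num_train_epochs = total_steps * batch_size // n_train
        print(n_train, total_steps, num_train_epochs)
    # 128  ->  24 steps, 3 epochs
    # 1024 -> 192 steps, 3 epochs
    # 8192 -> 200 steps, 0 epochs

Note that the floor division yields 0 epochs once the training set exceeds 3200 examples.
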
30 changes: 13 additions & 17 deletions first_experiment/utils.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
from typing import Tuple

import torch
@@ -7,7 +7,8 @@
from datasets import DatasetDict
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel

-CACHE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), ".cache")
+PATH = Path("/glusterfs/dfs-gfs-dist/goldejon/initial-starting-point-generation")
+CACHE_DIR = PATH / ".cache"


def get_embedding_model_and_tokenizer(
@@ -25,21 +26,27 @@ def get_embedding_model_and_tokenizer(

def get_classification_model_and_tokenizer(
model_name_or_path: str,
+    id2label: dict = None,
) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
"""
Get classification model and tokenizer.
"""
-    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name_or_path,
+        num_labels=len(id2label) if id2label is not None else None,
+        id2label=id2label if id2label is not None else None,
+        label2id={label: i for i, label in id2label.items()} if id2label is not None else None,
+    )
if torch.cuda.is_available():
model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
return model, tokenizer


def preprocess_function(
examples,
tokenizer: PreTrainedTokenizer,
task_keys: dict,
-    label_to_id: dict,
):
sentence1_key, sentence2_key = task_keys["text_column"]

@@ -50,12 +57,8 @@ def preprocess_function(
result = tokenizer(*texts, padding=True, max_length=256, truncation=True)

if "label" in examples:
-        if label_to_id is not None:
-            # Map labels to IDs (not necessary for GLUE tasks)
-            result["labels"] = [label_to_id[l] for l in examples["label"]]
-        else:
-            # In all cases, rename the column to labels because the model will expect that.
-            result["labels"] = examples["label"]
+        result["labels"] = examples["label"]

return result


@@ -68,20 +71,13 @@ def get_trainloader(
"""
Get dataloader for classification dataset.
"""
-    label_column = task_keys["label_column"]
-    try:
-        label_to_id = dict(enumerate(dataset["train"].features[label_column].feature.names))
-    except (AttributeError, KeyError):
-        label_to_id = None

processed_datasets = dataset.map(
preprocess_function,
batched=True,
remove_columns=dataset["train"].column_names,
fn_kwargs={
"tokenizer": tokenizer,
"task_keys": task_keys,
"label_to_id": label_to_id
},
desc="Running tokenizer on dataset",
)
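
Note on the utils.py changes above: preprocess_function now receives its tokenizer and task keys through datasets.map(fn_kwargs=...) and simply copies the label column, while get_classification_model_and_tokenizer configures the classification head from an optional id2label mapping. A self-contained sketch of the fn_kwargs calling pattern (the "text"/"label" column names for imdb are assumptions for illustration, not taken from the repository's task_to_keys):

    # Sketch of the fn_kwargs pattern; column names are assumed, not from the repo.
    from datasets import load_dataset
    from transformers import AutoTokenizer

    def preprocess_function(examples, tokenizer, task_keys):
        sentence1_key, sentence2_key = task_keys["text_column"]
        if sentence2_key is None:
            texts = (examples[sentence1_key],)
        else:
            texts = (examples[sentence1_key], examples[sentence2_key])
        result = tokenizer(*texts, padding=True, max_length=256, truncation=True)
        if "label" in examples:
            result["labels"] = examples["label"]
        return result

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    dataset = load_dataset("imdb")
    task_keys = {"text_column": ("text", None), "label_column": "label"}
    tokenized = dataset.map(
        preprocess_function,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "task_keys": task_keys},
    )
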
