diff --git a/first_experiment/main.py b/first_experiment/main.py
index 629fa55..cc925c3 100644
--- a/first_experiment/main.py
+++ b/first_experiment/main.py
@@ -17,23 +17,25 @@
 parser = ArgumentParser()
 parser.add_argument("--dataset", type=str, default="imdb")
 parser.add_argument("--tam_model", type=str, default="distilbert-base-uncased")
-parser.add_argument("--embedding_model", type=str, default="distilbert-base-uncased")
+parser.add_argument("--embedding_model", type=str, default=None)
 parser.add_argument("--init_strategy", type=str, choices=["random", "closest-to-centeroid", "furthest-to-centeroid", "expected-gradients", "certainty"], default="random")
 parser.add_argument("--stopping_criteria", type=str)
-parser.add_argument("--dataset_size", type=int, default=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192])
+parser.add_argument("--dataset_size", type=int, nargs="+", default=[32, 64, 128, 256, 512, 1024, 2048, 4096, 0])
 args = parser.parse_args()
 
 full_dataset = load_dataset(args.dataset)
-#TODO stopping_critera = load_stopping_criteria(args.stopping_criteria)
 task_keys = task_to_keys[args.dataset]
 
 for dataset_size in args.dataset_size:
-    dataset = select_fewshots(
-        args,
-        full_dataset,
-        dataset_size,
-        task_keys
-    )
+    if dataset_size > 0:
+        dataset = select_fewshots(
+            args,
+            full_dataset,
+            dataset_size,
+            task_keys
+        )
+    else:
+        dataset = full_dataset
 
     train_classification(
         args,
diff --git a/first_experiment/selection_strategies.py b/first_experiment/selection_strategies.py
index 53d3a75..240fa82 100644
--- a/first_experiment/selection_strategies.py
+++ b/first_experiment/selection_strategies.py
@@ -25,6 +25,8 @@ def select_fewshots(
             task_keys["label_column"]
         )
     elif args.init_strategy == "class-centeroid-closest":
+        if args.embedding_model is None:
+            raise ValueError("You need to specify an embedding model for this init strategy.")
         dataset = closest_to_centeroid_selection(
             args.embedding_model,
             full_dataset,
@@ -32,6 +34,8 @@
     elif args.init_strategy == "class-centeroid-furthest":
+        if args.embedding_model is None:
+            raise ValueError("You need to specify an embedding model for this init strategy.")
         dataset = furthest_to_centeroid_selection(
             args.embedding_model,
             full_dataset,
diff --git a/first_experiment/tam_training.py b/first_experiment/tam_training.py
index 36fe2f5..a548a14 100644
--- a/first_experiment/tam_training.py
+++ b/first_experiment/tam_training.py
@@ -1,54 +1,47 @@
 import json
-from pathlib import Path
+from argparse import Namespace
 
 import numpy as np
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
 import evaluate
+from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
+
+from utils import *
 
-PATH = Path("/glusterfs/dfs-gfs-dist/goldejon/initial-starting-point-generation")
 
 def train_classification(
-    args,
-    dataset,
-    text_column,
-    label_column,
+    args: Namespace,
+    dataset: DatasetDict,
+    task_keys: dict
 ):
+    label_column = task_keys["label_column"]
     id2label = dict(enumerate(dataset["train"].features[label_column].names))
-    label2id = {v: k for k, v in id2label.items()}
 
-    model = AutoModelForSequenceClassification.from_pretrained(
-        args.tam_model,
-        num_labels=len(id2label),
-        id2label=id2label,
-        label2id=label2id
-    )
-    tokenizer = AutoTokenizer.from_pretrained(args.tam_model)
+    model, tokenizer = get_classification_model_and_tokenizer(args.tam_model, id2label=id2label)
 
     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
-    if len(text_column) == 1:
-        def preprocess_function(examples):
-            return tokenizer(examples[text_column[0]], truncation=True)
-    else:
-        def preprocess_function(examples):
-            return tokenizer(examples[text_column[0]], examples[text_column[1]], truncation=True)
-
-    tokenized_dataset = dataset.map(preprocess_function, batched=True)
+    tokenized_dataset = dataset.map(
+        preprocess_function,
+        batched=True,
+        fn_kwargs={"tokenizer": tokenizer, "task_keys": task_keys}
+    )
 
-    if args.init_strategy in ["class-centeroid", "class-furthest"]:
-        embedding_model = args.embedding_model.split("/")[-1]
-        experiment_extension = f"{args.tam_model}_{args.dataset}_{args.dataset_size}_{args.init_strategy}_{embedding_model}"
-    else:
-        experiment_extension = f"{args.tam_model}_{args.dataset}_{args.dataset_size}_{args.init_strategy}"
+    experiment_extension = (f"{args.tam_model}"
+                            f"_{args.dataset}"
+                            f"_{args.dataset_size}"
+                            f"_{args.init_strategy}"
+                            f"_{args.embedding_model if args.embedding_model is not None else ''}")
 
     log_path = PATH / experiment_extension
 
+    batch_size = 16
+    total_steps = min(len(dataset["train"]) // batch_size * 3, 200)
     training_args = TrainingArguments(
         output_dir=str(log_path),
         learning_rate=2e-5,
         per_device_train_batch_size=16,
         per_device_eval_batch_size=16,
-        num_train_epochs=20,
+        num_train_epochs=total_steps * batch_size // len(dataset["train"]),
         warmup_ratio=0.1,
         weight_decay=0.01,
         logging_steps=5,
@@ -67,6 +60,7 @@ def compute_metrics(eval_pred):
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset["train"],
+        eval_dataset=tokenized_dataset["validation"] if "validation" in tokenized_dataset else None,
         tokenizer=tokenizer,
         data_collator=data_collator,
         compute_metrics=compute_metrics,
diff --git a/first_experiment/utils.py b/first_experiment/utils.py
index e600bf9..8e7ea7b 100644
--- a/first_experiment/utils.py
+++ b/first_experiment/utils.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 from typing import Tuple
 
 import torch
@@ -7,7 +7,8 @@
 from datasets import DatasetDict
 from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel
 
-CACHE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), ".cache")
+PATH = Path("/glusterfs/dfs-gfs-dist/goldejon/initial-starting-point-generation")
+CACHE_DIR = PATH / ".cache"
 
 
 def get_embedding_model_and_tokenizer(
@@ -25,21 +26,27 @@
 def get_classification_model_and_tokenizer(
     model_name_or_path: str,
+    id2label: dict = None,
 ) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
     """
     Get classification model and tokenizer.
     """
-    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name_or_path,
+        num_labels=len(id2label) if id2label is not None else None,
+        id2label=id2label if id2label is not None else None,
+        label2id={label: i for i, label in id2label.items()} if id2label is not None else None,
+    )
     if torch.cuda.is_available():
         model.cuda()
 
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 
     return model, tokenizer
 
+
 def preprocess_function(
     examples,
     tokenizer: PreTrainedTokenizer,
     task_keys: dict,
-    label_to_id: dict,
 ):
     sentence1_key, sentence2_key = task_keys["text_column"]
@@ -50,12 +57,8 @@ def preprocess_function(
     result = tokenizer(*texts, padding=True, max_length=256, truncation=True)
 
     if "label" in examples:
-        if label_to_id is not None:
-            # Map labels to IDs (not necessary for GLUE tasks)
-            result["labels"] = [label_to_id[l] for l in examples["label"]]
-        else:
-            # In all cases, rename the column to labels because the model will expect that.
-            result["labels"] = examples["label"]
+        result["labels"] = examples["label"]
+
     return result
 
 
@@ -68,12 +71,6 @@ def get_trainloader(
     """
     Get dataloader for classification dataset.
     """
-    label_column = task_keys["label_column"]
-    try:
-        label_to_id = dict(enumerate(dataset["train"].features[label_column].feature.names))
-    except (AttributeError, KeyError):
-        label_to_id = None
-
     processed_datasets = dataset.map(
         preprocess_function,
         batched=True,
@@ -81,7 +78,6 @@
         fn_kwargs={
            "tokenizer": tokenizer,
            "task_keys": task_keys,
-           "label_to_id": label_to_id
         },
         desc="Running tokenizer on dataset",
     )