diff --git a/paper_experiments/fine_tune_ner/README.md b/paper_experiments/fine_tune_ner/README.md
new file mode 100644
index 0000000..590f2d4
--- /dev/null
+++ b/paper_experiments/fine_tune_ner/README.md
@@ -0,0 +1,48 @@
+# Evaluation of LLMs' NLP downstream task capabilities through instruction fine-tuning
+
+## Setup
+
+Install the library:
+
+```bash
+# In the root of the project
+python -m pip install -e .
+```
+
+Install the requirements for this experiment:
+
+```bash
+python -m pip install -r requirements.txt
+```
+
+## Fine-tune Model
+
+```bash
+torchrun --nproc_per_node=2 train.py \
+    --model_name_or_path "../llama_hf" \
+    --bf16 True \
+    --output_dir dar_llama_big_noinp_clean \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 16 \
+    --evaluation_strategy "no" \
+    --save_strategy "epoch" \
+    --save_total_limit 3 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --fsdp "full_shard auto_wrap" \
+    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
+    --tf32 True
+```
+
+## Evaluate LLM with library
+
+This will generate NER tags for the CoNLL-2003 validation split and evaluate them against the gold labels.
+
+```bash
+python evaluate.py --model_name_or_path ""
+```
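+
+For example, to evaluate the checkpoint written by the training command above (assuming its default `--output_dir`):
+
+```bash
+python evaluate.py --model_name_or_path "dar_llama_big_noinp_clean"
+```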
diff --git a/paper_experiments/fine_tune_ner/evaluate.py b/paper_experiments/fine_tune_ner/evaluate.py
new file mode 100644
index 0000000..be408de
--- /dev/null
+++ b/paper_experiments/fine_tune_ner/evaluate.py
@@ -0,0 +1,131 @@
+import argparse
+import os
+
+from datasets import load_dataset, load_from_disk, Dataset
+from haystack.nodes import PromptNode
+
+from sklearn.metrics import classification_report, accuracy_score
+from sklearn.preprocessing import MultiLabelBinarizer
+
+from fabricator import DatasetGenerator
+from fabricator.prompts import BasePrompt
+from fabricator.samplers import random_sampler
+
+ner_prompt = (
+    "Given the following text, annotate the example and choose your annotations from: {}"
+)
+
+
+def main(args):
+    dataset = load_dataset(args.dataset_name, split=args.split)
+
+    prompt = BasePrompt(
+        task_description=ner_prompt,
+        generate_data_for_column="ner_tags",
+        fewshot_example_columns="tokens",
+        label_options={0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC"},
+    )
+
+    unlabeled_data = random_sampler(dataset, 30)
+
+    if not args.use_cached:
+        # Alternatives: "tiiuae/falcon-7b-instruct", "timdettmers/guanaco-33b-merged"
+        prompt_node = PromptNode(
+            model_name_or_path=args.model_name_or_path,
+            api_key=os.environ.get("HF_API_KEY"),
+        )
+
+        generator = DatasetGenerator(prompt_node)
+        generated_dataset: Dataset = generator.generate(
+            prompt_template=prompt,
+            unlabeled_dataset=unlabeled_data,
+            max_prompt_calls=30,
+            timeout_per_prompt=2,
+        )
+
+        # Use the same path for saving and loading so --use_cached finds the dataset
+        generated_dataset.save_to_disk("generated_dataset")
+    else:
+        generated_dataset = load_from_disk("generated_dataset")
+
+    evaluate(dataset, generated_dataset)
+
+
+def post_process(generated_samples):
+    """Some heuristics to clean up the generated samples."""
+
+    def _post_process(generated_sample):
+        cleaned_tags = []
+
+        for tag in generated_sample["ner_tags"]:
+            try:
+                cleaned_tags.append(int(tag))
+            except ValueError:
+                if tag == "-":
+                    cleaned_tags.append(0)
+                elif tag.startswith("[") and tag.endswith("]") and len(tag) > 2:
+                    try:
+                        cleaned_tags.append(int(tag[1:-1]))
+                    except ValueError:
+                        cleaned_tags.append(0)
+
+        generated_sample["ner_tags"] = cleaned_tags
+        return generated_sample
+
+    return generated_samples.map(_post_process)
+
+
+def build_gold_and_prediction_pairs(dataset, generated_dataset):
+    """Builds a list of gold and predicted labels for each sample in the dataset."""
+    golds = []
+    predictions = []
+
+    for generated_sample in generated_dataset:
+        for gold_sample in dataset:
+            if generated_sample["tokens"] == gold_sample["tokens"]:
+                golds.append(gold_sample["ner_tags"])
+                predictions.append(generated_sample["ner_tags"])
+
+    return golds, predictions
+
+
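+# Note on the metric below: MultiLabelBinarizer reduces each tag sequence to the set of
+# label ids that occur in it, so accuracy_score/classification_report compare which tag
+# classes appear per sentence, ignoring token positions and counts. This is a coarse,
+# sentence-level proxy rather than a token- or span-level NER metric (e.g. seqeval).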
+def calculate_metrics(golds, predictions):
+    mlb = MultiLabelBinarizer()
+    golds = mlb.fit_transform(golds)
+    predictions = mlb.transform(predictions)
+    acc = accuracy_score(golds, predictions)
+    report = classification_report(golds, predictions)
+    # Print the results
+    print(f"Accuracy: {acc}")
+    print(f"Classification Report:\n{report}")
+
+
+def evaluate(dataset, generated_dataset):
+    generated_dataset = post_process(generated_dataset)
+    print(f"Using {len(generated_dataset)} samples from the generated dataset")
+    golds, predictions = build_gold_and_prediction_pairs(dataset, generated_dataset)
+    calculate_metrics(golds, predictions)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_name_or_path", type=str, default="EleutherAI/pythia-70M-deduped")
+    parser.add_argument("--dataset_name", type=str, default="conll2003")
+    parser.add_argument("--split", type=str, default="validation")
+    # argparse's type=bool treats any non-empty string as True, so use a flag instead
+    parser.add_argument("--use_cached", action="store_true")
+
+    args = parser.parse_args()
+
+    main(args)
diff --git a/paper_experiments/fine_tune_ner/requirements.txt b/paper_experiments/fine_tune_ner/requirements.txt
new file mode 100644
index 0000000..d70a9b2
--- /dev/null
+++ b/paper_experiments/fine_tune_ner/requirements.txt
@@ -0,0 +1,3 @@
+torch
+transformers
+accelerate
diff --git a/paper_experiments/fine_tune_ner/train.py b/paper_experiments/fine_tune_ner/train.py
new file mode 100644
index 0000000..e5cce65
--- /dev/null
+++ b/paper_experiments/fine_tune_ner/train.py
@@ -0,0 +1,204 @@
+import copy
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Optional, Sequence
+
+import torch
+import transformers
+from torch.utils.data import Dataset
+from transformers import Trainer
+
+IGNORE_INDEX = -100
+DEFAULT_PAD_TOKEN = "[PAD]"
+DEFAULT_EOS_TOKEN = "</s>"
+DEFAULT_BOS_TOKEN = "<s>"
+DEFAULT_UNK_TOKEN = "<unk>"
+
+ner_prompt = (
+    "Write a response to the question or task specified in the instruction. "
+    "Note the input that provides further context.\n\n"
+    "Instruction:\n{instruction}\n\nResponse:"
+)
+
+
+@dataclass
+class ModelArguments:
+    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
+
+
+@dataclass
+class DataArguments:
+    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
+
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    cache_dir: Optional[str] = field(default=None)
+    optim: str = field(default="adamw_torch")
+    model_max_length: int = field(
+        default=512,
+        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
+    )
+
+
+def smart_tokenizer_and_embedding_resize(
+    special_tokens_dict: Dict,
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+):
+    """Resize tokenizer and embedding.
+
+    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    """
+    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    model.resize_token_embeddings(len(tokenizer))
+
+    if num_new_tokens > 0:
+        input_embeddings = model.get_input_embeddings().weight.data
+        output_embeddings = model.get_output_embeddings().weight.data
+
+        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+        # Initialize the new token embeddings with the mean of the existing ones
+        input_embeddings[-num_new_tokens:] = input_embeddings_avg
+        output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+
+def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
+    """Tokenize a list of strings."""
+    tokenized_list = [
+        tokenizer(
+            text,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        )
+        for text in strings
+    ]
+    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
+    input_ids_lens = labels_lens = [
+        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
+    ]
+    return dict(
+        input_ids=input_ids,
+        labels=labels,
+        input_ids_lens=input_ids_lens,
+        labels_lens=labels_lens,
+    )
+
+
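+# Standard instruction-tuning preprocessing: each prompt and its target are concatenated
+# into a single sequence, and the prompt portion of `labels` is overwritten with
+# IGNORE_INDEX so that the cross-entropy loss is computed only on the response tokens.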
+def preprocess(
+    sources: Sequence[str],
+    targets: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+) -> Dict:
+    """Preprocess the data by tokenizing."""
+    examples = [s + t for s, t in zip(sources, targets)]
+    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
+    input_ids = examples_tokenized["input_ids"]
+    labels = copy.deepcopy(input_ids)
+    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
+        label[:source_len] = IGNORE_INDEX
+    return dict(input_ids=input_ids, labels=labels)
+
+
+class SupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self, list_data_dict, tokenizer: transformers.PreTrainedTokenizer):
+        super(SupervisedDataset, self).__init__()
+        logging.warning("Loading data...")
+
+        sources = [ner_prompt.format_map(example) for example in list_data_dict]
+        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
+
+        logging.warning("Tokenizing inputs... This may take some time...")
+        data_dict = preprocess(sources, targets, tokenizer)
+
+        self.input_ids = data_dict["input_ids"]
+        self.labels = data_dict["labels"]
+
+    def __len__(self):
+        return len(self.input_ids)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """Collate examples for supervised fine-tuning."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+        return dict(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
+        )
+
+
+def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
+    """Make dataset and collator for supervised fine-tuning."""
+    from datasets import load_dataset
+
+    # CoNLL-2003
+    dataset = load_dataset("conll2003", split="train")
+    # Convert to dicts with keys "instruction" and "output" (the tags are ints,
+    # so stringify them before joining)
+    dataset = [
+        dict(
+            instruction=" ".join(example["tokens"]),
+            output=f'[{", ".join(str(tag) for tag in example["ner_tags"])}]',
+        )
+        for example in dataset
+    ]
+    train_dataset = SupervisedDataset(dataset, tokenizer=tokenizer)
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+    return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
+
+
+def train():
+    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=training_args.cache_dir,
+    )
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=training_args.cache_dir,
+        model_max_length=training_args.model_max_length,
+        padding_side="right",
+        use_fast=False,
+    )
+    special_tokens_dict = dict()
+    if tokenizer.pad_token is None or tokenizer.pad_token == "":
+        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
+    if tokenizer.eos_token is None or tokenizer.eos_token == "":
+        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
+    if tokenizer.bos_token is None or tokenizer.bos_token == "":
+        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
+    if tokenizer.unk_token is None or tokenizer.unk_token == "":
+        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
+
+    smart_tokenizer_and_embedding_resize(
+        special_tokens_dict=special_tokens_dict,
+        tokenizer=tokenizer,
+        model=model,
+    )
+
+    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
+    trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
+    trainer.train()
+    trainer.save_state()
+    trainer.save_model(output_dir=training_args.output_dir)
+
+
+if __name__ == "__main__":
+    train()
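+
+# For reference, make_supervised_data_module turns each CoNLL-2003 sample into an
+# instruction/output pair along these lines (illustrative values):
+#   instruction: "EU rejects German call to boycott British lamb ."
+#   output:      "[3, 0, 7, 0, 0, 0, 7, 0, 0]"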