Commit

Merge pull request #64 from flairNLP/fine_tune_eval
Add experiments for instruction fine-tuning on NER
whoisjones authored Aug 8, 2023
2 parents 969d1de + 4b2e09c commit 4935b26
Showing 4 changed files with 386 additions and 0 deletions.
48 changes: 48 additions & 0 deletions paper_experiments/fine_tune_ner/README.md
@@ -0,0 +1,48 @@
# Evaluation of LLMs' NLP Downstream-Task Capabilities through Instruction Fine-Tuning

## Setup

Install the library

```bash
# In root of project
python -m pip install -e .
```

Install the requirements for this experiment

```bash
python -m pip install -r requirements.txt
```

## Fine-tune Model

```bash
torchrun --nproc_per_node=2 train.py \
--model_name_or_path "../llama_hf" \
--bf16 True \
--output_dir dar_llama_big_noinp_clean \
--num_train_epochs 3 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "epoch" \
--save_total_limit 3 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--fsdp "full_shard auto_wrap" \
--fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
--tf32 True
```
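
The command above uses the two-GPU FSDP setup from the experiments. For a quick single-GPU smoke test, a minimal sketch (hypothetical, smaller settings, not the configuration used for the reported results) could look like this, since `train.py` accepts the standard Hugging Face `TrainingArguments`:

```bash
python train.py \
--model_name_or_path "facebook/opt-125m" \
--output_dir smoke_test_run \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 8 \
--learning_rate 2e-5 \
--save_strategy "epoch" \
--logging_steps 10
```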

## Evaluate an LLM with the library

This generates NER tags for a random sample of the CoNLL-03 validation split and evaluates them against the gold labels.

```bash
python evaluate.py --model_name_or_path "<HF_MODEL>"
```
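
For example, to evaluate one of the hosted instruction-tuned models listed in `evaluate.py` (the script reads an API key from the `HF_API_KEY` environment variable; `--dataset_name` and `--split` default to `conll2003` and `validation`):

```bash
export HF_API_KEY="<YOUR_HF_TOKEN>"
python evaluate.py \
--model_name_or_path "tiiuae/falcon-7b-instruct" \
--split validation
```
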
131 changes: 131 additions & 0 deletions paper_experiments/fine_tune_ner/evaluate.py
@@ -0,0 +1,131 @@
import argparse
import os

from datasets import load_dataset, load_from_disk, Dataset
from haystack.nodes import PromptNode

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt
from fabricator.samplers import random_sampler


ner_prompt = (
"Given the following text. Annotate the example and choose your annotations from: {}"
)

def main(args):

dataset = load_dataset(args.dataset_name, split=args.split)


    # Note: the CoNLL-03 MISC tags (7 = B-MISC, 8 = I-MISC) are not offered as label options
    prompt = BasePrompt(
task_description=ner_prompt,
generate_data_for_column="ner_tags",
fewshot_example_columns="tokens",
label_options={0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC"},
)

unlabeled_data = random_sampler(dataset, 30)

if not args.use_cached:

# "tiiuae/falcon-7b-instruct"
# timdettmers/guanaco-33b-merged
prompt_node = PromptNode(
model_name_or_path=args.model_name_or_path,
api_key=os.environ.get("HF_API_KEY"),
)


generator = DatasetGenerator(prompt_node)
generated_dataset: Dataset = generator.generate(
prompt_template=prompt,
unlabeled_dataset=unlabeled_data,
max_prompt_calls=30,
timeout_per_prompt=2,
)

        # Cache the generations under the path that the --use_cached branch loads from
        generated_dataset.save_to_disk("generated_dataset")

else:
generated_dataset = load_from_disk("generated_dataset")


evaluate(dataset, generated_dataset)


def post_process(generated_samples):
"""Some heuristics to clean up the generated samples"""

def _post_process(generated_sample):

cleaned_tags = []

for tag in generated_sample["ner_tags"]:
try:
cleaned_tags.append(int(tag))
except ValueError:
if tag == "-":
cleaned_tags.append(0)
                elif tag.startswith("[") and tag.endswith("]") and len(tag) > 2:
                    try:
                        cleaned_tags.append(int(tag[1:-1]))
                    except ValueError:
                        cleaned_tags.append(0)
                else:
                    # Fall back to "O" for anything unparsable so the tag sequence keeps its length
                    cleaned_tags.append(0)

generated_sample["ner_tags"] = cleaned_tags

return generated_sample

return generated_samples.map(_post_process)

def build_gold_and_prediction_pairs(dataset, generated_dataset):
"""Builds a list of gold and predicted labels for each sample in the dataset"""

golds = []
predictions = []

for generated_sample in generated_dataset:

for gold_sample in dataset:

if generated_sample["tokens"] == gold_sample["tokens"]:
golds.append(gold_sample["ner_tags"])
predictions.append(generated_sample["ner_tags"])


return golds, predictions

def calculate_metrics(golds, predictions):
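    # Binarize each sentence's set of gold/predicted tag ids and compare them as a
    # multi-label problem: accuracy is exact-set match, the report gives per-tag scores.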
mlb = MultiLabelBinarizer()
golds = mlb.fit_transform(golds)
predictions = mlb.transform(predictions)
acc = accuracy_score(golds, predictions)
report = classification_report(golds, predictions)
# Print the results
print(f"Accuracy: {acc}")
print(f"Classification Report:\n{report}")


def evaluate(dataset, generated_dataset):
generated_dataset = post_process(generated_dataset)
print(f"Using {generated_dataset} of samples from the generated dataset")
golds, predictions = build_gold_and_prediction_pairs(dataset, generated_dataset)
calculate_metrics(golds, predictions)


if __name__ == "__main__":

parser = argparse.ArgumentParser()

parser.add_argument("--model_name_or_path", type=str, default="EleutherAI/pythia-70M-deduped")
parser.add_argument("--dataset_name", type=str, default="conll2003")
parser.add_argument("--split", type=str, default="validation")
parser.add_argument("--use_cached", type=bool, default=False)

args = parser.parse_args()

main(args)
3 changes: 3 additions & 0 deletions paper_experiments/fine_tune_ner/requirements.txt
@@ -0,0 +1,3 @@
torch
transformers
accelerate
204 changes: 204 additions & 0 deletions paper_experiments/fine_tune_ner/train.py
@@ -0,0 +1,204 @@
import copy
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"


ner_prompt = (
"Write a response to the question or task specified in the instruction. "
"Note the input that provides further context.\n\n"
"Instruction:\n{instruction}\n\nResponse:"
)

@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")


@dataclass
class DataArguments:
data_path: str = field(default=None, metadata={"help": "Path to the training data."})


@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=512,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)


def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data

input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
tokenized_list = [
tokenizer(
text,
return_tensors="pt",
padding="longest",
max_length=tokenizer.model_max_length,
truncation=True,
)
for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
return dict(
input_ids=input_ids,
labels=labels,
input_ids_lens=input_ids_lens,
labels_lens=labels_lens,
)


def preprocess(
sources: Sequence[str],
targets: Sequence[str],
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
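    # Mask the prompt tokens with IGNORE_INDEX (-100) so the loss is only computed on the response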
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
label[:source_len] = IGNORE_INDEX
return dict(input_ids=input_ids, labels=labels)


class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

    def __init__(self, list_data_dict, tokenizer: transformers.PreTrainedTokenizer):
super(SupervisedDataset, self).__init__()
logging.warning("Loading data...")

sources = [
ner_prompt.format_map(example) for example in list_data_dict
]
targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]

logging.warning("Tokenizing inputs... This may take some time...")
data_dict = preprocess(sources, targets, tokenizer)

self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]

def __len__(self):
return len(self.input_ids)

def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(input_ids=self.input_ids[i], labels=self.labels[i])


@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""

tokenizer: transformers.PreTrainedTokenizer

def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
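        # Pad to the longest sequence in the batch; label padding uses IGNORE_INDEX so it is excluded from the loss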
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
)
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
return dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""

from datasets import load_dataset
    # Load the CoNLL-03 training split
dataset = load_dataset("conll2003", split="train")
# Convert to dict with keys "instruction" and "output"
    dataset = [
        dict(
            instruction=" ".join(example["tokens"]),
            # ner_tags are integer class ids, so cast them to str before joining
            output=f'[{", ".join(str(tag) for tag in example["ner_tags"])}]',
        )
        for example in dataset
    ]
train_dataset = SupervisedDataset(dataset, tokenizer=tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)


def train():
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length,
padding_side="right",
use_fast=False,
)
special_tokens_dict = dict()
if tokenizer.pad_token is None or tokenizer.pad_token == "":
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None or tokenizer.eos_token == "":
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None or tokenizer.bos_token == "":
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None or tokenizer.unk_token == "":
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
)

data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
train()
