From a605a398c4a2836ca27ca30ce21ed197ce3cc226 Mon Sep 17 00:00:00 2001 From: whoisjones Date: Tue, 24 Oct 2023 11:25:07 +0100 Subject: [PATCH 1/6] prepare setup.py for v0.1.1 --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 50d8f46..46dc980 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,15 @@ def requirements(): setup( name='fabricator', - version='0.1', + version='0.1.1', author='Humboldt University Berlin, deepset GmbH', - description='Generate datasets with large language models.', + author_email="goldejon@informatik.hu-berlin.de", + description='Conveniently generate datasets with large language models.', + long_description="If you need textual datasets for specific tasks, you can leverage the immense generation " "capabilities of large language models. fabricator enables you to conveniently create " "or annotate datasets to fine-tune your custom model. fabricator is built on " "deepset's haystack and huggingface's datasets libraries and integrates seamlessly " "into existing NLP frameworks.", package_dir={"": "src"}, packages=find_packages("src"), license="Apache 2.0", From 223c2adbaa7a4244e5d12c4ee9af35f9294f3534 Mon Sep 17 00:00:00 2001 From: whoisjones Date: Tue, 24 Oct 2023 11:31:21 +0100 Subject: [PATCH 2/6] remove submission experiments from future releases --- paper_experiments/conll_annotate_dataset.py | 72 ------- paper_experiments/conll_gpt_train_model.py | 38 ---- paper_experiments/fine_tune_ner/README.md | 48 ----- paper_experiments/fine_tune_ner/evaluate.py | 131 ----------- .../fine_tune_ner/requirements.txt | 3 - paper_experiments/fine_tune_ner/train.py | 204 ------------------ paper_experiments/mrpc_annotate_dataset.py | 52 ----- paper_experiments/mrpc_train_model.py | 143 ------------ paper_experiments/snli_annotate_dataset.py | 54 ----- paper_experiments/snli_train_model.py | 144 ------------- paper_experiments/squad_annotate_dataset.py | 140 ------------ paper_experiments/squad_train_model.py | 59 ----- paper_experiments/trec_annotate_dataset.py | 72 ------- .../trec_annotation_train_model.py | 155 ------------- paper_experiments/trec_generate_dataset.py | 62 ------ .../trec_generation_train_model.py | 148 ------------- .../trec_hyperparameter_annotate_dataset.py | 73 ------- ...rec_hyperparameter_annotate_train_model.py | 121 ----------- .../trec_hyperparameter_generate_dataset.py | 70 ------ ...rec_hyperparameter_generate_train_model.py | 111 ---------- 20 files changed, 1900 deletions(-) delete mode 100644 paper_experiments/conll_annotate_dataset.py delete mode 100644 paper_experiments/conll_gpt_train_model.py delete mode 100644 paper_experiments/fine_tune_ner/README.md delete mode 100644 paper_experiments/fine_tune_ner/evaluate.py delete mode 100644 paper_experiments/fine_tune_ner/requirements.txt delete mode 100644 paper_experiments/fine_tune_ner/train.py delete mode 100644 paper_experiments/mrpc_annotate_dataset.py delete mode 100644 paper_experiments/mrpc_train_model.py delete mode 100644 paper_experiments/snli_annotate_dataset.py delete mode 100644 paper_experiments/snli_train_model.py delete mode 100644 paper_experiments/squad_annotate_dataset.py delete mode 100644 paper_experiments/squad_train_model.py delete mode 100644 paper_experiments/trec_annotate_dataset.py delete mode 100644 paper_experiments/trec_annotation_train_model.py delete mode 100644 paper_experiments/trec_generate_dataset.py delete mode 100644 paper_experiments/trec_generation_train_model.py delete mode
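For context on the new long_description above: the workflow it refers to can be pieced together from the experiment scripts removed in the next patch. A minimal sketch, assuming the fabricator 0.1.1 API exactly as used in those scripts (the dataset, model name, and sample size are illustrative only):

```python
import os
from datasets import load_dataset
from haystack.nodes import PromptNode
from fabricator import DatasetGenerator, BasePrompt
from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts

# Convert integer class ids to label strings so the LLM can read and emit them.
annotation_dataset, label_options = convert_label_ids_to_texts(
    load_dataset("trec", split="train").select(range(10)),
    "coarse_label",
    return_label_options=True,
)

# The {} placeholder in the task description is filled with the label options.
prompt = BasePrompt(
    task_description="Classify the question into exactly one of the following classes: {}.",
    label_options=label_options,
    generate_data_for_column="coarse_label",
    fewshot_example_columns="text",
)

# Any haystack PromptNode can serve as the LLM backend.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=os.environ.get("OPENAI_API_KEY"),
    max_length=100,
)

generator = DatasetGenerator(prompt_node)
generated_dataset = generator.generate(
    prompt_template=prompt,
    unlabeled_dataset=annotation_dataset,
    max_prompt_calls=len(annotation_dataset),
)
```

A fewshot_dataset and fewshot_examples_per_class can additionally be passed to generate(), as the removed annotation scripts below do.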
100644 paper_experiments/trec_hyperparameter_annotate_dataset.py delete mode 100644 paper_experiments/trec_hyperparameter_annotate_train_model.py delete mode 100644 paper_experiments/trec_hyperparameter_generate_dataset.py delete mode 100644 paper_experiments/trec_hyperparameter_generate_train_model.py diff --git a/paper_experiments/conll_annotate_dataset.py b/paper_experiments/conll_annotate_dataset.py deleted file mode 100644 index dee1eaa..0000000 --- a/paper_experiments/conll_annotate_dataset.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from datasets import load_dataset -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.token_classification import convert_token_labels_to_spans - - -def run(): - fewshot_dataset = load_dataset("conll2003", split="train") - fewshot_dataset, label_options = convert_token_labels_to_spans( - fewshot_dataset, - "tokens", - "ner_tags", - expanded_label_mapping={ - 0: "O", - 1: "B-person", - 2: "I-person", - 3: "B-organization", - 4: "I-organization", - 5: "B-location", - 6: "I-location", - 7: "B-miscellaneous", - 8: "I-miscellaneous", - } - ) - - annotation_dataset = load_dataset("conll2003", split="validation") - annotation_dataset, label_options = convert_token_labels_to_spans( - annotation_dataset, - "tokens", - "ner_tags", - expanded_label_mapping={ - 0: "O", - 1: "B-person", - 2: "I-person", - 3: "B-organization", - 4: "I-organization", - 5: "B-location", - 6: "I-location", - 7: "B-miscellaneous", - 8: "I-miscellaneous", - } - ) - - prompt = BasePrompt( - task_description="Extract the following named entities from the text: {}. " - "Your output format must be exactly the same as from the fewshot examples.", - label_options=label_options, - generate_data_for_column="ner_tags", - fewshot_example_columns="tokens", - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=500, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=3, - unlabeled_dataset=annotation_dataset, - max_prompt_calls=len(annotation_dataset), - ) - - generated_dataset.push_to_hub("conll-validation-annotated", private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/conll_gpt_train_model.py b/paper_experiments/conll_gpt_train_model.py deleted file mode 100644 index 3bccc31..0000000 --- a/paper_experiments/conll_gpt_train_model.py +++ /dev/null @@ -1,38 +0,0 @@ -from argparse import ArgumentParser -from datasets import load_dataset -from fabricator import convert_spans_to_token_labels -from seqeval.metrics import accuracy_score, f1_score - - -def run(args): - id2label = { - 0: "O", - 1: "B-person", - 2: "I-person", - 3: "B-organization", - 4: "I-organization", - 5: "B-location", - 6: "I-location", - 7: "B-miscellaneous", - 8: "I-miscellaneous", - } - dataset = load_dataset(args.corpus, split="train") - dataset = convert_spans_to_token_labels(dataset, "tokens", "ner_tags", id2label=id2label) - original = load_dataset("conll2003", split="validation") - y_pred = [] - y_true = [] - for generated_example, original_example in zip(dataset, original): - if len(generated_example["tokens"]) == len(original_example["tokens"]): - y_pred.append([id2label[tag] for tag in generated_example["ner_tags"]]) - y_true.append([id2label[tag] for tag in original_example["ner_tags"]]) - - print(len(y_pred) / 
len(dataset)) - print(accuracy_score(y_true, y_pred)) - print(f1_score(y_true, y_pred)) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("--corpus", type=str) - arguments = parser.parse_args() - run(arguments) diff --git a/paper_experiments/fine_tune_ner/README.md b/paper_experiments/fine_tune_ner/README.md deleted file mode 100644 index 590f2d4..0000000 --- a/paper_experiments/fine_tune_ner/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Evaluating the NLP downstream-task capabilities of LLMs through instruction fine-tuning - -## Setup - -Install the library - -```bash -# In root of project -python -m pip install -e . -``` - -Install the requirements for this experiment - -```bash -python -m pip install -r requirements.txt -``` - -## Fine-tune Model - -```bash -torchrun --nproc_per_node=2 train.py \ - --model_name_or_path "../llama_hf" \ - --bf16 True \ - --output_dir dar_llama_big_noinp_clean \ - --num_train_epochs 3 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 16 \ - --evaluation_strategy "no" \ - --save_strategy "epoch" \ - --save_total_limit 3 \ - --learning_rate 2e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --fsdp "full_shard auto_wrap" \ - --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ - --tf32 True -``` - -## Evaluate the LLM with the library - -This generates NER tags for the CoNLL03 validation split and evaluates them against the gold labels. - -```bash -python evaluate.py --model_name_or_path "" -``` diff --git a/paper_experiments/fine_tune_ner/evaluate.py b/paper_experiments/fine_tune_ner/evaluate.py deleted file mode 100644 index be408de..0000000 --- a/paper_experiments/fine_tune_ner/evaluate.py +++ /dev/null @@ -1,131 +0,0 @@ -import argparse -import os - -from datasets import load_dataset, load_from_disk, Dataset -from haystack.nodes import PromptNode - -from sklearn.metrics import classification_report, accuracy_score -from sklearn.preprocessing import MultiLabelBinarizer - -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt -from fabricator.samplers import random_sampler - - -ner_prompt = ( - "Given the following text, 
annotate the example and choose your annotations from: {}" -) - -def main(args): - - dataset = load_dataset(args.dataset_name, split=args.split) - - - prompt = BasePrompt( - task_description=ner_prompt, - generate_data_for_column="ner_tags", - fewshot_example_columns="tokens", - label_options={0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC"}, - ) - - unlabeled_data = random_sampler(dataset, 30) - - if not args.use_cached: - - # "tiiuae/falcon-7b-instruct" - # timdettmers/guanaco-33b-merged - prompt_node = PromptNode( - model_name_or_path=args.model_name_or_path, - api_key=os.environ.get("HF_API_KEY"), - ) - - - generator = DatasetGenerator(prompt_node) - generated_dataset: Dataset = generator.generate( - prompt_template=prompt, - unlabeled_dataset=unlabeled_data, - max_prompt_calls=30, - timeout_per_prompt=2, - ) - - generated_dataset.save_to_disk("generated_dataset_starchat") - - else: - generated_dataset = load_from_disk("generated_dataset") - - - evaluate(dataset, generated_dataset) - - -def post_process(generated_samples): - """Some heuristics to clean up the generated samples""" - - def _post_process(generated_sample): - - cleaned_tags = [] - - for tag in generated_sample["ner_tags"]: - try: - cleaned_tags.append(int(tag)) - except ValueError: - if tag == "-": - cleaned_tags.append(0) - elif tag.startswith("[") and tag.endswith("]") and len(tag) > 2: - try: - cleaned_tags.append(int(tag[1:-1])) - except ValueError: - cleaned_tags.append(0) - - generated_sample["ner_tags"] = cleaned_tags - - return generated_sample - - return generated_samples.map(_post_process) - -def build_gold_and_prediction_pairs(dataset, generated_dataset): - """Builds a list of gold and predicted labels for each sample in the dataset""" - - golds = [] - predictions = [] - - for generated_sample in generated_dataset: - - for gold_sample in dataset: - - if generated_sample["tokens"] == gold_sample["tokens"]: - golds.append(gold_sample["ner_tags"]) - predictions.append(generated_sample["ner_tags"]) - - - return golds, predictions - -def calculate_metrics(golds, predictions): - mlb = MultiLabelBinarizer() - golds = mlb.fit_transform(golds) - predictions = mlb.transform(predictions) - acc = accuracy_score(golds, predictions) - report = classification_report(golds, predictions) - # Print the results - print(f"Accuracy: {acc}") - print(f"Classification Report:\n{report}") - - -def evaluate(dataset, generated_dataset): - generated_dataset = post_process(generated_dataset) - print(f"Using {len(generated_dataset)} samples from the generated dataset") - golds, predictions = build_gold_and_prediction_pairs(dataset, generated_dataset) - calculate_metrics(golds, predictions) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - - parser.add_argument("--model_name_or_path", type=str, default="EleutherAI/pythia-70M-deduped") - parser.add_argument("--dataset_name", type=str, default="conll2003") - parser.add_argument("--split", type=str, default="validation") - parser.add_argument("--use_cached", action="store_true") - - args = parser.parse_args() - - main(args) diff --git a/paper_experiments/fine_tune_ner/requirements.txt b/paper_experiments/fine_tune_ner/requirements.txt deleted file mode 100644 index d70a9b2..0000000 --- a/paper_experiments/fine_tune_ner/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -transformers -accelerate diff --git a/paper_experiments/fine_tune_ner/train.py b/paper_experiments/fine_tune_ner/train.py deleted file mode 100644 index e5cce65..0000000
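As an aside on the clean-up heuristics in post_process above, a toy example of their effect; the tag strings are hypothetical LLM output, and post_process refers to the function defined in evaluate.py:

```python
from datasets import Dataset

# Hypothetical generated sample: tags come back as strings, sometimes with
# placeholder dashes or bracket-wrapped numbers.
sample = Dataset.from_dict({
    "tokens": [["EU", "rejects", "German", "call"]],
    "ner_tags": [["3", "-", "[7]", "0"]],
})

cleaned = post_process(sample)  # post_process from evaluate.py above
print(cleaned[0]["ner_tags"])   # -> [3, 0, 7, 0]
# Tags matching none of the heuristics are silently dropped, which can
# shorten the tag list relative to the token list.
```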
--- a/paper_experiments/fine_tune_ner/train.py +++ /dev/null @@ -1,204 +0,0 @@ -import copy -import logging -from dataclasses import dataclass, field -from typing import Dict, Optional, Sequence - -import torch -import transformers -from torch.utils.data import Dataset -from transformers import Trainer - -IGNORE_INDEX = -100 -DEFAULT_PAD_TOKEN = "[PAD]" -DEFAULT_EOS_TOKEN = "</s>" -DEFAULT_BOS_TOKEN = "<s>" -DEFAULT_UNK_TOKEN = "<unk>" - - -ner_prompt = ( - "Write a response to the question or task specified in the instruction. " - "Note the input that provides further context.\n\n" - "Instruction:\n{instruction}\n\nResponse:" -) - -@dataclass -class ModelArguments: - model_name_or_path: Optional[str] = field(default="facebook/opt-125m") - - -@dataclass -class DataArguments: - data_path: str = field(default=None, metadata={"help": "Path to the training data."}) - - -@dataclass -class TrainingArguments(transformers.TrainingArguments): - cache_dir: Optional[str] = field(default=None) - optim: str = field(default="adamw_torch") - model_max_length: int = field( - default=512, - metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."}, - ) - - -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict, - tokenizer: transformers.PreTrainedTokenizer, - model: transformers.PreTrainedModel, -): - """Resize tokenizer and embedding. - - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. - """ - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - model.resize_token_embeddings(len(tokenizer)) - - if num_new_tokens > 0: - input_embeddings = model.get_input_embeddings().weight.data - output_embeddings = model.get_output_embeddings().weight.data - - input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) - output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) - - input_embeddings[-num_new_tokens:] = input_embeddings_avg - output_embeddings[-num_new_tokens:] = output_embeddings_avg - - -def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict: - """Tokenize a list of strings.""" - tokenized_list = [ - tokenizer( - text, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ) - for text in strings - ] - input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list] - input_ids_lens = labels_lens = [ - tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list - ] - return dict( - input_ids=input_ids, - labels=labels, - input_ids_lens=input_ids_lens, - labels_lens=labels_lens, - ) - - -def preprocess( - sources: Sequence[str], - targets: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, -) -> Dict: - """Preprocess the data by tokenizing.""" - examples = [s + t for s, t in zip(sources, targets)] - examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)] - input_ids = examples_tokenized["input_ids"] - labels = copy.deepcopy(input_ids) - for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]): - label[:source_len] = IGNORE_INDEX - return dict(input_ids=input_ids, labels=labels) - - -class SupervisedDataset(Dataset): - """Dataset for supervised fine-tuning.""" - - def __init__(self, list_data_dict, tokenizer: transformers.PreTrainedTokenizer): - super(SupervisedDataset, self).__init__() - logging.warning("Loading data...") - -
sources = [ - ner_prompt.format_map(example) for example in list_data_dict - ] - targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict] - - logging.warning("Tokenizing inputs... This may take some time...") - data_dict = preprocess(sources, targets, tokenizer) - - self.input_ids = data_dict["input_ids"] - self.labels = data_dict["labels"] - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, i) -> Dict[str, torch.Tensor]: - return dict(input_ids=self.input_ids[i], labels=self.labels[i]) - - -@dataclass -class DataCollatorForSupervisedDataset(object): - """Collate examples for supervised fine-tuning.""" - - tokenizer: transformers.PreTrainedTokenizer - - def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: - input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) - input_ids = torch.nn.utils.rnn.pad_sequence( - input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id - ) - labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) - return dict( - input_ids=input_ids, - labels=labels, - attention_mask=input_ids.ne(self.tokenizer.pad_token_id), - ) - - -def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict: - """Make dataset and collator for supervised fine-tuning.""" - - from datasets import load_dataset - # Conll - dataset = load_dataset("conll2003", split="train") - # Convert to dict with keys "instruction" and "output"; the integer tags must be cast to str before joining - dataset = [dict(instruction=" ".join(example["tokens"]), output=f'[{", ".join(map(str, example["ner_tags"]))}]') for example in dataset] - train_dataset = SupervisedDataset(dataset, tokenizer=tokenizer) - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator) - - -def train(): - parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - model = transformers.AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - ) - - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - model_max_length=training_args.model_max_length, - padding_side="right", - use_fast=False, - ) - special_tokens_dict = dict() - if tokenizer.pad_token is None or tokenizer.pad_token == "": - special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN - if tokenizer.eos_token is None or tokenizer.eos_token == "": - special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN - if tokenizer.bos_token is None or tokenizer.bos_token == "": - special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN - if tokenizer.unk_token is None or tokenizer.unk_token == "": - special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN - - smart_tokenizer_and_embedding_resize( - special_tokens_dict=special_tokens_dict, - tokenizer=tokenizer, - model=model, - ) - - data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args) - trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) - trainer.train() - trainer.save_state() - trainer.save_model(output_dir=training_args.output_dir) - - -if __name__ == "__main__": - train() diff --git a/paper_experiments/mrpc_annotate_dataset.py b/paper_experiments/mrpc_annotate_dataset.py deleted file mode
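A self-contained toy illustration of the loss masking performed in preprocess above; the token ids are made up, and the deleted code applies the same slice assignment to tensors rather than lists:

```python
import copy

IGNORE_INDEX = -100

def mask_source(input_ids, source_len):
    # The prompt (source) tokens contribute no loss; only the response
    # (target) tokens are kept as labels.
    labels = copy.deepcopy(input_ids)
    labels[:source_len] = [IGNORE_INDEX] * source_len
    return labels

# Five hypothetical prompt tokens followed by three response tokens.
print(mask_source([101, 2023, 2003, 1037, 3231, 7099, 3433, 102], 5))
# -> [-100, -100, -100, -100, -100, 7099, 3433, 102]
```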
100644 index a1706d5..0000000 --- a/paper_experiments/mrpc_annotate_dataset.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - - -def run(): - annotation_dataset, label_options = convert_label_ids_to_texts( - load_dataset("glue", "mrpc", split="train"), - "label", - return_label_options=True, - ) - fewshot_datasets = [] - for label in range(2): - filtered_ds = load_dataset("glue", "mrpc", split="validation").filter( - lambda x: x["label"] == label) - fewshot_datasets.append(filtered_ds.select(range(6))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - - fewshot_dataset = convert_label_ids_to_texts(fewshot_dataset, "label") - - prompt = BasePrompt( - task_description="Given two sentences, determine by means of the fewshot examples whether these sentences are: {}.", - label_options=label_options, - generate_data_for_column="label", - fewshot_example_columns=["sentence1", "sentence2"], - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset, original_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=2, - fewshot_sampling_strategy="stratified", - unlabeled_dataset=annotation_dataset, - max_prompt_calls=len(annotation_dataset), - return_unlabeled_dataset=True - ) - - generated_dataset.push_to_hub("glue_mrpc_annotated_12_fewshot_examples_2_per_prompt_stratified", private=True) - original_dataset.push_to_hub("glue_mrpc_original", private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/mrpc_train_model.py b/paper_experiments/mrpc_train_model.py deleted file mode 100644 index 1e150c9..0000000 --- a/paper_experiments/mrpc_train_model.py +++ /dev/null @@ -1,143 +0,0 @@ -import argparse -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \ - DataCollatorWithPadding -import evaluate -import shutil - - -def run(args): - # iterate over all corpora - for corpus_name in args.corpora: - # iterate over all sizes, -1 means we are taking all examples but at most 10k - for size in [-1, 50, 500, 1000]: - # Average results for corpus and size over 5 seeds - result_avg = [] - for seed in [41, 42, 43, 44, 45]: - # Load the dataset - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("glue", "mrpc", split="test") - - # preprocess annotated dataset - ensure unified labels (lowercased and no whitespaces) + correct - # ClassLabel feature - if "annotated" in corpus_name: - original_labels = test_split.features["label"].names - - def clean_labels(examples): - examples["label"] = examples["label"].lower() - return examples - - dataset = dataset.map(clean_labels) - - dst_feat = ClassLabel(names=original_labels) - dataset = dataset.map(lambda batch: { - "label": dst_feat.str2int(batch)}, input_columns="label", batched=True) - new_features = dataset.features.copy() - new_features["label"] = dst_feat - dataset = dataset.cast(new_features) - - # Compose final training dataset + gold-labeled test split - if size > 0: - dataset = dataset.select(range(size)) - 
dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - # Preprocessing function - def preprocess_function(examples): - return tokenizer( - examples["sentence1"], - examples["sentence2"], - padding=True, - truncation=True, - return_tensors="pt" - ) - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - # Create model and move to CUDA - model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - # Set number of training epochs depending on dataset size - if size < 0: - num_train_epochs = 5 - elif size == 1000: - num_train_epochs = 10 - else: - num_train_epochs = 20 - - # Make tmp path for storing the model - tmp_path = f"tmp/{corpus_name.replace('/', '-')}-{size}-samples" - - # Training arguments - training_args = TrainingArguments( - output_dir=tmp_path, - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - results = trainer.predict(tokenized_dataset["test"]) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # remove tmp path since we iterate over seeds, corpora and sizes - shutil.rmtree(tmp_path) - - # change -1 for logging to 'all' - if size > 0: - log_size = str(size) - else: - log_size = "all" - - # write results to log file - log_corpus_name = corpus_name.replace("whoisjones/", "") - file = f"{log_corpus_name}-{log_size}-samples" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") - - -if __name__ == "__main__": - # Run like 'python snli_train_model.py --corpora hfaccount/generated-model snli - parser = argparse.ArgumentParser() - parser.add_argument("--corpora", nargs='+') # a list of generated and gold-label corpus - arguments = parser.parse_args() - run(arguments) diff --git a/paper_experiments/snli_annotate_dataset.py b/paper_experiments/snli_annotate_dataset.py deleted file mode 100644 index 8a09ab4..0000000 --- a/paper_experiments/snli_annotate_dataset.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - - -def run(): - annotation_dataset, label_options = convert_label_ids_to_texts( - load_dataset("snli", 
split="train").filter(lambda x: x["label"] in [0,1,2]).shuffle(seed=42).select( - range(10000)), - "label", - return_label_options=True, - ) - - fewshot_datasets = [] - for label in range(3): - filtered_ds = load_dataset("snli", split="validation").filter(lambda x: x["label"] == label) - fewshot_datasets.append(filtered_ds.select(range(6))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - - fewshot_dataset = convert_label_ids_to_texts(fewshot_dataset, "label") - - prompt = BasePrompt( - task_description="Given two sentences, determine by means of the fewshot examples whether these sentence " - "pairs are: {}.", - label_options=label_options, - generate_data_for_column="label", - fewshot_example_columns=["premise", "hypothesis"], - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset, original_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=2, - fewshot_sampling_strategy="stratified", - unlabeled_dataset=annotation_dataset, - max_prompt_calls=len(annotation_dataset), - return_unlabeled_dataset=True - ) - - generated_dataset.push_to_hub("snli_annotated_18_fewshot_examples_2_per_prompt_stratified", private=True) - original_dataset.push_to_hub("snli_original", private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/snli_train_model.py b/paper_experiments/snli_train_model.py deleted file mode 100644 index 82e925e..0000000 --- a/paper_experiments/snli_train_model.py +++ /dev/null @@ -1,144 +0,0 @@ -import argparse -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \ - DataCollatorWithPadding -import evaluate -import shutil - - -def run(args): - # iterate over all corpora - for corpus_name in args.corpora: - # iterate over all sizes, -1 means we are taking all examples but at most 10k - for size in [-1, 50, 500, 1000]: - # Average results for corpus and size over 5 seeds - result_avg = [] - for seed in [41, 42, 43, 44, 45]: - # Load the dataset - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("snli", split="test") - test_split = test_split.filter(lambda x: x["label"] != -1) - - # preprocess annotated dataset - ensure unified labels (lowercased and no whitespaces) + correct - # ClassLabel feature - if "annotated" in corpus_name: - original_labels = test_split.features["label"].names - - def clean_labels(examples): - examples["label"] = examples["label"].lower() - return examples - - dataset = dataset.map(clean_labels) - - dst_feat = ClassLabel(names=original_labels) - dataset = dataset.map(lambda batch: { - "label": dst_feat.str2int(batch)}, input_columns="label", batched=True) - new_features = dataset.features.copy() - new_features["label"] = dst_feat - dataset = dataset.cast(new_features) - - # Compose final training dataset + gold-labeled test split - if size > 0: - dataset = dataset.select(range(size)) - dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - # Preprocessing 
function - def preprocess_function(examples): - return tokenizer( - examples["premise"], - examples["hypothesis"], - padding=True, - truncation=True, - return_tensors="pt" - ) - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - # Create model and move to CUDA - model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - # Set number of training epochs depending on dataset size - if size < 0: - num_train_epochs = 5 - elif size == 1000: - num_train_epochs = 10 - else: - num_train_epochs = 20 - - # Make tmp path for storing the model - tmp_path = f"tmp/{corpus_name.replace('/', '-')}-{size}-samples" - - # Training arguments - training_args = TrainingArguments( - output_dir=tmp_path, - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - results = trainer.predict(tokenized_dataset["test"]) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # remove tmp path since we iterate over seeds, corpora and sizes - shutil.rmtree(tmp_path) - - # change -1 for logging to 'all' - if size > 0: - log_size = str(size) - else: - log_size = "all" - - # write results to log file - log_corpus_name = corpus_name.replace("whoisjones/", "") - file = f"{log_corpus_name}-{log_size}-samples" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") - - -if __name__ == "__main__": - # Run like 'python snli_train_model.py --corpora hfaccount/generated-model snli - parser = argparse.ArgumentParser() - parser.add_argument("--corpora", nargs='+') # a list of generated and gold-label corpus - arguments = parser.parse_args() - run(arguments) diff --git a/paper_experiments/squad_annotate_dataset.py b/paper_experiments/squad_annotate_dataset.py deleted file mode 100644 index b04fe44..0000000 --- a/paper_experiments/squad_annotate_dataset.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import random -from argparse import ArgumentParser - -from datasets import Sequence, Value, load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.question_answering import ( - preprocess_squad_format, - postprocess_squad_format, -) - - -def run(arguments): - """Generate answers based on a few-shot example from context and question.""" - org = load_dataset(arguments.dataset, split=arguments.split) - - dataset_name = f"qa-dataset" - dataset_answerable_questions = org.filter(lambda sample: sample['answers']['text']).shuffle() - dataset_unanswerable_questions = org.filter(lambda 
sample: not sample['answers']['text']).shuffle() - - prompt_node = PromptNode( - model_name_or_path=arguments.llm, api_key=os.environ.get("OPENAI_API_KEY"), max_length=100 - ) - generator = DatasetGenerator(prompt_node) - - question_words = ["Which", "What", "How", "When", "Who", "How many", "Where", "Why"] - - filtered_generated_datasets = [] - filtered_original_datasets = [] - - def merge_columns(example): - if example["answers"] == "": - example["question"] = example["question"] - return example - example["question"] = f"{example['question']}\nAnswer: {example['answers']}" - return example - - def split_columns(example): - entries = example["question"].split("\nAnswer:") - example["question"] = entries[0] - if len(entries) == 1: - example["answers"] = "" - return example - example["answers"] = entries[1].strip() - return example - - for index, dataset in enumerate([dataset_answerable_questions, dataset_unanswerable_questions]): - preprocessed_dataset = preprocess_squad_format(dataset) - preprocessed_dataset = preprocessed_dataset.map(merge_columns) - fewshot_examples = preprocessed_dataset.select(range(10)) - labels_to_generate = arguments.num_labels//3*2 if index == 0 else arguments.num_labels//3 - unlabeled_examples = preprocessed_dataset.select(range(10, labels_to_generate + 10, 1)) - - for i in range(0, len(unlabeled_examples), arguments.save_steps): - task_descriptions = [ - "Given a text, first create a difficult question that can be answered using the text. The question must describe the context of the text. Second, extract the answer to this question from the text. The answer must be word for word exactly as it appears in the text.", - f"You are a student and a teacher is teaching you about a new topic. Ask a short follow-up question about something the teacher hasn't mentioned yet at all. You must not ask something you already know the answer to from the teacher's explanations. You must not ask for further clarification if the teacher already mentioned something in passing. The question should be self-contained. It must not contain the word \"other\" as in \"which other\" or \"what other\". 
The question should start with one of {random.sample(question_words, 3)}"] - - prompt = BasePrompt( - task_description=task_descriptions[index], - fewshot_example_columns=arguments.input_variables, - generate_data_for_column=arguments.target_variable, - ) - - current_unlabeled_examples = unlabeled_examples.select(range(i, min(len(unlabeled_examples), i + arguments.save_steps))) - generated_dataset, original_dataset = generator.generate( - fewshot_dataset=fewshot_examples, - fewshot_examples_per_class=arguments.support_examples_per_prompt, - unlabeled_dataset=current_unlabeled_examples, - prompt_template=prompt, - max_prompt_calls=arguments.max_prompt_calls, - return_unlabeled_dataset=True, - ) - - generated_dataset = generated_dataset.map(split_columns) - - assert len(generated_dataset) == len(original_dataset) - - # filter bad samples from generated dataset - if index == 0: # answerable questions - generated_dataset = postprocess_squad_format(generated_dataset, add_answer_start=True) - indices_to_keep = \ - generated_dataset.map(lambda example, idx: {'idx': idx if example['answers']['answer_start'][0] >= 0 else -1}, - with_indices=True)['idx'] - else: # unanswerable questions - generated_dataset = postprocess_squad_format(generated_dataset, add_answer_start=False) - indices_to_keep = \ - generated_dataset.map(lambda example, idx: {'idx': idx if example['answers']['answer_start'] == [] else -1}, - with_indices=True)['idx'] - indices_to_keep = [i for i in indices_to_keep if i != -1] - - generated_dataset = generated_dataset.select(indices_to_keep) - original_dataset = original_dataset.select(indices_to_keep) - - # add id and title to generated dataset - generated_dataset = generated_dataset.add_column("id", original_dataset['id']) - generated_dataset = generated_dataset.add_column("title", original_dataset['title']) - - ids_to_keep = set(original_dataset['id']) - original_dataset = dataset.filter(lambda example: example['id'] in ids_to_keep) - - features = generated_dataset.features - features["answers"] = Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None) - generated_dataset = generated_dataset.cast(features) - - filtered_generated_datasets.append(generated_dataset) - filtered_original_datasets.append(original_dataset) - - filtered_generated_concatenated_dataset = concatenate_datasets(filtered_generated_datasets) - filtered_generated_concatenated_dataset.save_to_disk(f"{dataset_name}-generated-{index}-{i}") - filtered_original_concatenated_dataset = concatenate_datasets(filtered_original_datasets) - filtered_original_concatenated_dataset.save_to_disk(f"{dataset_name}-original-{index}-{i}") - - if arguments.push_to_hub: - filtered_generated_concatenated_dataset.push_to_hub(f"{dataset_name}-generated-{len(filtered_generated_concatenated_dataset)}", private=False) - filtered_original_concatenated_dataset.push_to_hub(f"{dataset_name}-original-{len(filtered_original_concatenated_dataset)}", private=False) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("--llm", type=str, default="gpt-3.5-turbo") - parser.add_argument("--max_generation_length", type=int, default=100) - parser.add_argument( - "--task_description", - type=str, - default="Given a context and a question, generate an answer that occurs exactly and only once in the text.", - ) - parser.add_argument("--dataset", type=str, default="squad_v2") - parser.add_argument("--split", type=str, default="train") - 
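For clarity, a hypothetical round trip through the merge_columns/split_columns helpers defined earlier in this script (assuming both are in scope; the question/answer pair is made up):

```python
example = {"question": "Who wrote Faust?", "answers": "Goethe"}

# merge_columns folds the answer into the question field for prompting ...
merged = merge_columns(dict(example))
assert merged["question"] == "Who wrote Faust?\nAnswer: Goethe"

# ... and split_columns recovers both fields from the generated text.
restored = split_columns(merged)
assert restored["question"] == "Who wrote Faust?"
assert restored["answers"] == "Goethe"
```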
parser.add_argument("--input_variables", type=str, nargs="+", default=["context"]) - parser.add_argument("--target_variable", type=str, default="question") - parser.add_argument("--output_format", type=str, default="text") - parser.add_argument("--max_prompt_calls", type=int, default=20) - parser.add_argument("--support_examples_per_prompt", type=int, default=1) - parser.add_argument("--push_to_hub", action="store_false") #TODO set default back to store_true - parser.add_argument("--save_steps", type=int, default=5) - parser.add_argument("--num_labels", type=int, default=10) - args = parser.parse_args() - run(args) diff --git a/paper_experiments/squad_train_model.py b/paper_experiments/squad_train_model.py deleted file mode 100644 index 2279ccf..0000000 --- a/paper_experiments/squad_train_model.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -from haystack.nodes import FARMReader -from datasets import load_dataset -import json - -from haystack.utils import SquadData - - -def hf_to_squad(dataset_name: str) -> str: - dataset = load_dataset(dataset_name, split="train").shuffle().to_dict() - paragraphs = [] - - for i in range(len(dataset["context"])): - if dataset["answers"][i]["text"]: - answers = [ - {"text": dataset["answers"][i]["text"][0], "answer_start": dataset["answers"][i]["answer_start"][0]}] - impossible = False - else: - answers = [] - impossible = True - paragraph = {"qas": [{"id": dataset["id"][i], "question": dataset["question"][i], "answers": answers, - "is_impossible": impossible}], "context": dataset["context"][i]} - paragraphs.append(paragraph) - - squad = {"version": "1.0", "data": [{"title": "test", "paragraphs": paragraphs}]} - - filename = f"squad-{dataset_name.split('/')[-1]}" - with open(f"{filename}.json", "w") as f: - f.write(json.dumps(squad)) - return filename - - -if __name__ == '__main__': - config = [1_000, 10_000, 20_000] - model_name = "roberta-base" # "bert-base-uncased" is a worse alternative - dataset_names = ["julianrisch/qa-dataset-generated-21020", "julianrisch/qa-dataset-original-21020"] - - for dataset_name in dataset_names: - squad_filename = hf_to_squad(dataset_name) - dataset = SquadData.from_file(filename=f"{squad_filename}.json") - for num_samples in config: - train_filename = f"{squad_filename}-{num_samples}.json" - sample = dataset.sample_questions(num_samples) - SquadData(squad_data=sample).save(train_filename) - - # Model Training - reader_directory = f"{squad_filename}-{num_samples}-{model_name}" - reader = FARMReader(model_name_or_path=model_name, return_no_answer=True, use_confidence_scores=False) - reader.train(data_dir="..", train_filename=train_filename, dev_split=0.1, use_gpu=True, batch_size=16, max_seq_len=384) - reader.save(Path(reader_directory)) - - # Model Evaluation - reader = FARMReader(reader_directory, return_no_answer=True, use_confidence_scores=False, max_seq_len=384) - reader_eval_results = reader.eval_on_file(data_dir="..", test_filename="dev-v2.0.json") - with open("log.txt", "a") as log_file: - log_file.write(str(reader_directory)+'\n') - log_file.write(str(reader_eval_results)+'\n') - diff --git a/paper_experiments/trec_annotate_dataset.py b/paper_experiments/trec_annotate_dataset.py deleted file mode 100644 index d10375a..0000000 --- a/paper_experiments/trec_annotate_dataset.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from 
fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - - -def run(): - dataset = load_dataset("trec", split="train").shuffle(seed=42) - fewshot_datasets = [] - annotation_datasets = [] - for label in range(6): - filtered_ds = dataset.filter(lambda x: x["coarse_label"] == label) - fewshot_datasets.append(filtered_ds.select(range(6))) - annotation_datasets.append(filtered_ds.select(range(6, len(filtered_ds)))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - annotation_dataset = concatenate_datasets(annotation_datasets).shuffle(seed=42) - - extended_mapping = { - 0: "abbreviation", - 1: "entity", - 2: "description", - 3: "human", - 4: "location", - 5: "number" - } - - annotation_dataset, label_options = convert_label_ids_to_texts( - annotation_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - return_label_options=True, - ) - - fewshot_dataset = convert_label_ids_to_texts( - fewshot_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - ) - - prompt = BasePrompt( - task_description="Based on the fewshot examples, classify the question into exactly one of the following classes: {}.", - label_options=label_options, - generate_data_for_column="coarse_label", - fewshot_example_columns="text", - fewshot_formatting_template="Question: {text}\nClass: {coarse_label}", - target_formatting_template="Question: {text}\nClass: ", - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset, original_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=2, - fewshot_sampling_strategy="stratified", - unlabeled_dataset=annotation_dataset, - max_prompt_calls=len(annotation_dataset), - return_unlabeled_dataset=True - ) - - generated_dataset.push_to_hub("trec_annotated_36_fewshot_examples_2_per_prompt_stratified", private=True) - original_dataset.push_to_hub("trec_original", private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/trec_annotation_train_model.py b/paper_experiments/trec_annotation_train_model.py deleted file mode 100644 index db99650..0000000 --- a/paper_experiments/trec_annotation_train_model.py +++ /dev/null @@ -1,155 +0,0 @@ -import argparse -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \ - DataCollatorWithPadding -import evaluate -import shutil - - -def run(args): - # iterate over all corpora - for corpus_name in args.corpora: - # iterate over all sizes, -1 means we are taking all examples but at most 10k - for size in [-1, 50, 500, 1000]: - # Average results for corpus and size over 5 seeds - result_avg = [] - for seed in [41, 42, 43, 44, 45]: - label_alignment = { - "NUM": "number", - "ENTY": "entity", - "DESC": "description", - "ABBR": "abbreviation", - "HUM": "human", - "LOC": "location", - } - # Load the dataset - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("trec", split="test") - - # preprocess annotated dataset - ensure unified labels (lowercased and no whitespaces) + correct - # ClassLabel feature - if "annotated" in corpus_name: - original_labels = test_split.features["coarse_label"].names - - def clean_labels(examples): - label = examples["coarse_label"].replace("Class: 
", "") - if label not in list(label_alignment.values()): - label = "remove" - examples["coarse_label"] = label - return examples - - dataset = dataset.map(clean_labels) - dataset = dataset.filter(lambda x: x["coarse_label"] != "remove") - - dst_feat = ClassLabel(names=[label_alignment[k] for k in original_labels]) - dataset = dataset.map(lambda batch: { - "coarse_label": dst_feat.str2int(batch)}, input_columns="coarse_label", batched=True) - new_features = dataset.features.copy() - new_features["coarse_label"] = dst_feat - dataset = dataset.cast(new_features) - - # Compose final training dataset + gold-labeled test split - if size > 0: - dataset = dataset.select(range(size)) - dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - dataset = dataset.rename_column("coarse_label", "label") - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - # Preprocessing function - def preprocess_function(examples): - return tokenizer( - examples["text"], - padding=True, - truncation=True, - return_tensors="pt" - ) - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - # Create model and move to CUDA - model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - # Set number of training epochs depending on dataset size - if size < 0: - num_train_epochs = 5 - elif size == 1000: - num_train_epochs = 10 - else: - num_train_epochs = 20 - - # Make tmp path for storing the model - tmp_path = f"tmp/{corpus_name.replace('/', '-')}-{size}-samples" - - # Training arguments - training_args = TrainingArguments( - output_dir=tmp_path, - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - results = trainer.predict(tokenized_dataset["test"]) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # remove tmp path since we iterate over seeds, corpora and sizes - shutil.rmtree(tmp_path) - - # change -1 for logging to 'all' - if size > 0: - log_size = str(size) - else: - log_size = "all" - - # write results to log file - log_corpus_name = corpus_name.replace("whoisjones/", "") - file = f"{log_corpus_name}-{log_size}-samples" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") - - -if __name__ == "__main__": - # Run like 'python trec_annotation_train_model.py --corpora hfaccount/generated-model trec - parser = argparse.ArgumentParser() - 
parser.add_argument("--corpora", nargs='+') # a list of generated and gold-label corpus - arguments = parser.parse_args() - run(arguments) diff --git a/paper_experiments/trec_generate_dataset.py b/paper_experiments/trec_generate_dataset.py deleted file mode 100644 index 5706086..0000000 --- a/paper_experiments/trec_generate_dataset.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - - -def run(): - dataset = load_dataset("trec", split="train").shuffle(seed=42) - fewshot_datasets = [] - for label in range(6): - filtered_ds = dataset.filter(lambda x: x["coarse_label"] == label) - fewshot_datasets.append(filtered_ds.select(range(8))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - - extended_mapping = { - 0: "abbreviation", - 1: "entity", - 2: "description", - 3: "human", - 4: "location", - 5: "number" - } - - fewshot_dataset, label_options = convert_label_ids_to_texts( - fewshot_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - return_label_options=True, - ) - - prompt = BasePrompt( - task_description="Generate a new question that asks about: {}. The new question should be very different from " - "the fewshot examples.", - label_options=label_options, - generate_data_for_column="text", - fewshot_formatting_template="Question: {text}", - target_formatting_template="Question: ", - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=3, - fewshot_sampling_strategy="uniform", - fewshot_sampling_column="coarse_label", - max_prompt_calls=10000, - num_samples_to_generate=10000, - ) - - generated_dataset.push_to_hub("trec_generate_48_fewshot_examples_3_per_prompt_stratified", private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/trec_generation_train_model.py b/paper_experiments/trec_generation_train_model.py deleted file mode 100644 index d86cb25..0000000 --- a/paper_experiments/trec_generation_train_model.py +++ /dev/null @@ -1,148 +0,0 @@ -import argparse -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \ - DataCollatorWithPadding -import evaluate -import shutil - - -def run(args): - # iterate over all corpora - for corpus_name in args.corpora: - # iterate over all sizes, -1 means we are taking all examples but at most 10k - for size in [-1, 50, 500, 1000]: - # Average results for corpus and size over 5 seeds - result_avg = [] - for seed in [41, 42, 43, 44, 45]: - label_alignment = { - "NUM": "number", - "ENTY": "entity", - "DESC": "description", - "ABBR": "abbreviation", - "HUM": "human", - "LOC": "location", - } - # Load the dataset - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("trec", split="test") - - # preprocess annotated dataset - ensure unified labels (lowercased and no whitespaces) + correct - # ClassLabel feature - if "annotated" in corpus_name: - original_labels = test_split.features["coarse_label"].names - - def clean_labels(examples): - 
label = examples["coarse_label"].replace("Class: ", "") - if label not in list(label_alignment.values()): - label = "remove" - examples["coarse_label"] = label - return examples - - dataset = dataset.map(clean_labels) - dataset = dataset.filter(lambda x: x["coarse_label"] != "remove") - - # Compose final training dataset + gold-labeled test split - if size > 0: - dataset = dataset.select(range(size)) - dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - dataset = dataset.rename_column("coarse_label", "label") - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - # Preprocessing function - def preprocess_function(examples): - return tokenizer( - examples["text"], - padding=True, - truncation=True, - return_tensors="pt" - ) - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - # Create model and move to CUDA - model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - # Set number of training epochs depending on dataset size - if size < 0: - num_train_epochs = 5 - elif size == 1000: - num_train_epochs = 10 - else: - num_train_epochs = 20 - - # Make tmp path for storing the model - tmp_path = f"tmp/{corpus_name.replace('/', '-')}-{size}-samples" - - # Training arguments - training_args = TrainingArguments( - output_dir=tmp_path, - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - results = trainer.predict(tokenized_dataset["test"]) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # remove tmp path since we iterate over seeds, corpora and sizes - shutil.rmtree(tmp_path) - - # change -1 for logging to 'all' - if size > 0: - log_size = str(size) - else: - log_size = "all" - - # write results to log file - log_corpus_name = corpus_name.replace("whoisjones/", "") - file = f"{log_corpus_name}-{log_size}-samples" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") - - -if __name__ == "__main__": - # Run like 'python trec_generation_train_model.py --corpora hfaccount/generated-model trec - parser = argparse.ArgumentParser() - parser.add_argument("--corpora", nargs='+') # a list of generated and gold-label corpus - arguments = parser.parse_args() - run(arguments) diff --git a/paper_experiments/trec_hyperparameter_annotate_dataset.py b/paper_experiments/trec_hyperparameter_annotate_dataset.py deleted file 
mode 100644 index 489ff38..0000000 --- a/paper_experiments/trec_hyperparameter_annotate_dataset.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - -def run(): - for possible_examples_per_class, fewshot_example_per_class in [(0,0), (2,1), (2,2), (4,1), (4,2), (4,3), (4,4), (8,1), (8,2), (8,3), - (8,4), (16,1), (16,2), (16,3), (16,4)]: - dataset = load_dataset("trec", split="train").shuffle(seed=42).train_test_split(500, stratify_by_column="coarse_label") - fewshot_dataset = dataset["train"] - annotation_dataset = dataset["test"] - fewshot_datasets = [] - for label in range(6): - filtered_ds = fewshot_dataset.filter(lambda x: x["coarse_label"] == label) - fewshot_datasets.append(filtered_ds.select(range(possible_examples_per_class))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - - extended_mapping = { - 0: "abbreviation", - 1: "entity", - 2: "description", - 3: "human", - 4: "location", - 5: "number" - } - - if possible_examples_per_class > 0: - fewshot_dataset = convert_label_ids_to_texts( - fewshot_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - ) - - annotation_dataset, label_options = convert_label_ids_to_texts( - annotation_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - return_label_options=True, - ) - - prompt = BasePrompt( - task_description="Classify the question into exactly one of the following classes: {}.", - label_options=label_options, - generate_data_for_column="coarse_label", - fewshot_example_columns="text", - fewshot_formatting_template="Question: {text}\nClass: {coarse_label}", - target_formatting_template="Question: {text}\nClass: ", - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset if possible_examples_per_class > 0 else None, - fewshot_examples_per_class=fewshot_example_per_class if possible_examples_per_class > 0 else 0, - fewshot_sampling_strategy="stratified" if possible_examples_per_class > 0 else None, - fewshot_sampling_column="coarse_label" if possible_examples_per_class > 0 else None, - unlabeled_dataset=annotation_dataset, - max_prompt_calls=len(annotation_dataset), - ) - - model_name = f"trec_hyperparameter_annotated_{possible_examples_per_class}_possible_examples_{fewshot_example_per_class}_used" - generated_dataset.push_to_hub(model_name, private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/trec_hyperparameter_annotate_train_model.py b/paper_experiments/trec_hyperparameter_annotate_train_model.py deleted file mode 100644 index e0d812d..0000000 --- a/paper_experiments/trec_hyperparameter_annotate_train_model.py +++ /dev/null @@ -1,121 +0,0 @@ -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding -import evaluate -import shutil - - -def run(possible_examples_per_class, fewshot_example_per_class, seed): - corpus_name = f"whoisjones/trec_hyperparameter_annotated_{possible_examples_per_class}_possible_examples_{fewshot_example_per_class}_used" - 
- if "corpus_name" not in locals(): - raise Exception("Please insert the generated corpora before running this script.") - - label_alignment = { - "NUM": "number", - "ENTY": "entity", - "DESC": "description", - "ABBR": "abbreviation", - "HUM": "human", - "LOC": "location", - } - # Load the dataset - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("trec", split="test") - original_labels = test_split.features["coarse_label"].names - - def clean_labels(examples): - label = examples["coarse_label"].replace("Class: ", "") - if label not in list(label_alignment.values()): - label = "remove" - examples["coarse_label"] = label - return examples - - dataset = dataset.map(clean_labels) - dataset = dataset.filter(lambda x: x["coarse_label"] != "remove") - - dst_feat = ClassLabel(names=[label_alignment[k] for k in original_labels]) - dataset = dataset.map(lambda batch: { - "coarse_label": dst_feat.str2int(batch)}, input_columns="coarse_label", batched=True) - new_features = dataset.features.copy() - new_features["coarse_label"] = dst_feat - dataset = dataset.cast(new_features) - - dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - dataset = dataset.rename_column("coarse_label", "label") - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - def preprocess_function(examples): - return tokenizer(examples["text"], padding=True, truncation=True, return_tensors="pt") - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - num_train_epochs = 20 - - # Training arguments - training_args = TrainingArguments( - output_dir="output_model", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - return trainer.predict(tokenized_dataset["test"]) - - -if __name__ == "__main__": - # for every combination of possible fewshot examples and fewshot examples used - for possible_examples_per_class, fewshot_example_per_class in [(0, 0), (2, 1), (2, 2), (4, 1), (4, 2), (4, 3), - (4, 4), (8, 1), (8, 2), (8, 3), (8, 4), (16, 1), - (16, 2), (16, 3), (16, 4)]: - result_avg = [] - # iterate over seeds - for seed in [41, 42, 43, 44, 45]: - results = run(possible_examples_per_class, fewshot_example_per_class, seed) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # log for hyperparameter run - file = 
f"hyperparameter-trec-annotation-{possible_examples_per_class}-possible-{fewshot_example_per_class}-used" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") diff --git a/paper_experiments/trec_hyperparameter_generate_dataset.py b/paper_experiments/trec_hyperparameter_generate_dataset.py deleted file mode 100644 index ca00e4c..0000000 --- a/paper_experiments/trec_hyperparameter_generate_dataset.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -from datasets import load_dataset, concatenate_datasets -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator, BasePrompt -from fabricator.dataset_transformations.text_classification import convert_label_ids_to_texts - -def run(): - for possible_examples_per_class, fewshot_example_per_class in [(0,0), (2,2), (4,2), (4,3), (4,4), (8,2), (8,3), - (8,4), (8,5), (16,2), (16,3), (16,4), (16,5)]: - dataset = load_dataset("trec", split="train").shuffle(seed=42) - if possible_examples_per_class > 0: - fewshot_datasets = [] - for label in range(6): - filtered_ds = dataset.filter(lambda x: x["coarse_label"] == label) - fewshot_datasets.append(filtered_ds.select(range(possible_examples_per_class))) - fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42) - - extended_mapping = { - 0: "abbreviation", - 1: "entity", - 2: "description", - 3: "human", - 4: "location", - 5: "number" - } - - fewshot_dataset, label_options = convert_label_ids_to_texts( - fewshot_dataset, - "coarse_label", - expanded_label_mapping=extended_mapping, - return_label_options=True, - ) - task_description = "Generate a new question that asks about: {}. The new question should be very " \ - "different from the fewshot examples." - else: - task_description = "Generate a new question that asks about: {}." 
- label_options = ["abbreviation", "entity", "description", "human", "location", "number"] - fewshot_dataset = None - - prompt = BasePrompt( - task_description=task_description, - label_options=label_options, - generate_data_for_column="text", - fewshot_formatting_template="Question: {text}", - target_formatting_template="Question: ", - ) - - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, - ) - - generator = DatasetGenerator(prompt_node) - generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=fewshot_example_per_class, - fewshot_sampling_strategy="uniform", - fewshot_sampling_column="coarse_label", - max_prompt_calls=500, - num_samples_to_generate=500, - ) - - model_name = f"trec_generated_{possible_examples_per_class}_possible_examples_{fewshot_example_per_class}_used" - generated_dataset.push_to_hub(model_name, private=True) - - -if __name__ == "__main__": - run() diff --git a/paper_experiments/trec_hyperparameter_generate_train_model.py b/paper_experiments/trec_hyperparameter_generate_train_model.py deleted file mode 100644 index e607872..0000000 --- a/paper_experiments/trec_hyperparameter_generate_train_model.py +++ /dev/null @@ -1,111 +0,0 @@ -import numpy as np -from datasets import load_dataset, ClassLabel -from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding -import evaluate -import shutil - - -def run(possible_examples_per_class, fewshot_example_per_class, seed): - # TODO insert the generated corpora here like - # corpus_name = "trec_{possible_examples_per_class}_{fewshot_example_per_class}" - - if "corpus_name" not in locals(): - raise Exception("Please insert the generated corpora before running this script.") - - label_alignment = { - "NUM": "number", - "ENTY": "entity", - "DESC": "description", - "ABBR": "abbreviation", - "HUM": "human", - "LOC": "location", - } - dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed) - test_split = load_dataset("trec", split="test") - test_split = test_split.rename_column("coarse_label", "label") - original_labels = test_split.features["label"].names - - dst_feat = ClassLabel(names=[label_alignment[k] for k in original_labels]) - dataset = dataset.map(lambda batch: { - "label": dst_feat.str2int(batch)}, input_columns="label", batched=True) - new_features = dataset.features.copy() - new_features["label"] = dst_feat - dataset = dataset.cast(new_features) - - dataset = dataset.train_test_split(test_size=0.1) - dataset["validation"] = dataset["test"] - dataset["test"] = test_split - num_labels = dataset["train"].features["label"].num_classes - - # Load the BERT tokenizer and model - model_name = "bert-base-uncased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - - def preprocess_function(examples): - return tokenizer(examples["text"], padding=True, truncation=True, return_tensors="pt") - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - accuracy = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return accuracy.compute(predictions=predictions, references=labels) - - id2label = dict(enumerate(dataset["train"].features["label"].names)) - label2id = {v: k for k, v in id2label.items()} - - model = 
AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id - ).to("cuda") - - num_train_epochs = 10 - - # Training arguments - training_args = TrainingArguments( - output_dir="output_model", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=num_train_epochs, - weight_decay=0.01, - save_total_limit=1, - evaluation_strategy="epoch", - push_to_hub=False, - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - trainer.train() - - return trainer.predict(tokenized_dataset["test"]) - - -if __name__ == "__main__": - # for every combination of possible fewshot examples and fewshot examples used - for possible_examples_per_class, fewshot_example_per_class in [(0, 0), (2, 2), (4, 2), (4, 3), (4, 4), (8, 2), - (8, 3), (8, 4), (8, 5), (16, 2), (16, 3), (16, 4), - (16, 5)]: - result_avg = [] - # iterate over seeds - for seed in [41, 42, 43, 44, 45]: - results = run(possible_examples_per_class, fewshot_example_per_class, seed) - result_avg.append(results.metrics["test_accuracy"] * 100) - - # log for hyperparameter run - file = f"hyperparameter-trec-{possible_examples_per_class}-possible-{fewshot_example_per_class}-used" - with open(f"results/{file}.log", "w") as f: - f.write(f"Accuracy: {np.mean(result_avg)}\n") - f.write(f"Standard deviation: {np.std(result_avg)}\n") From 501583b5250b9e9dfc0f05fbaa4869a9cef50254 Mon Sep 17 00:00:00 2001 From: whoisjones Date: Tue, 24 Oct 2023 12:57:46 +0100 Subject: [PATCH 3/6] added news section to README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 560b03c..bb23062 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,15 @@
+## News
+
+- **[10/23]** Our paper was accepted at EMNLP 2023. You can find the preprint [here](https://arxiv.org/abs/2309.09582); the experimental scripts are available under release v0.1.0.
+- **[09/23]** Support for `gpt-3.5-turbo-instruct` added in the new [Haystack](https://github.com/deepset-ai/haystack) release!
+- **[08/23]** Added several experimental scripts to investigate the generation and annotation abilities of `gpt-3.5-turbo` on various downstream tasks, as well as the influence of few-shot examples on downstream performance.
+- **[07/23]** Refactoring of major classes - you can now simply use our BasePrompt class to create your own customized prompts for every downstream task!
+- **[07/23]** Added dataset transformations for token classification to prompt LLMs with textual spans rather than with lists of tags.
+- **[06/23]** Initial version of fabricator supporting text classification and question answering tasks.
+
 ## Overview
 
 This repository:
From 75d09d2a2b8ede427d53af96801ec58937cb1711 Mon Sep 17 00:00:00 2001
From: whoisjones <goldejon@informatik.hu-berlin.de>
Date: Wed, 25 Oct 2023 12:58:01 +0200
Subject: [PATCH 4/6] improvement TUTORIAL-1_OVERVIEW.md

---
 resources/generation_workflow.png |  Bin 0 -> 54842 bytes
 tutorials/TUTORIAL-1_OVERVIEW.md  |  175 +++++++++++-------------------
 2 files changed, 63 insertions(+), 112 deletions(-)
 create mode 100644 resources/generation_workflow.png

diff --git a/resources/generation_workflow.png b/resources/generation_workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..29deaeddf38e5086a152f382f3a44fb4156acf3c
GIT binary patch
literal 54842 [54842 bytes of base85-encoded PNG data omitted]
zWWkE6SnXZ=5Vvv9q#v^qHg-Qnt+&_9I37+^c-ic|)}{=4QDLF$cN*w^U~+J@8JyES z+dRaI$4h#T$(dKufUscLX>F;ZVH@3Uq-~t!Ig2s$`i!53GNOPVfjpc53dVopdKtyk z@H6$9NAfGM4i+S$vZo-4?M^+fnR0!G?2AW@U?)&(*V~X{5|3umwz=8dH>lRDQ1lGE zJ%_)WeJk7C89RCA<%I#h*_o(fP`a@09Zro9~*`XXg zZF{$B>4}SoC&1)iS{AVf@xgg>gtDp5is;hHW3D1B<&_g`c{Dk@e@J#)| z5flv<$5&(l2aSCxXEJCmQdsgLbS;Ze(j}isn_a3Tsc<&^wH;+R31{k}o5v%47pRjm zBOO5it_GyziYRp1mtBN{4@JxIu=C}w2M2(#B#nU#7o$kKAS;_Y^K8SCPZf$Mt}T~1 z6LjR)cN?S`W#|im>hs@a*D`XCTu(3NWcv>nX#Nb#!J&DpRh1qsiK7F_;BokTnm&^l zeXYa=ra%P9Z?s=ZQTCb=kJJaYWCX2 z9l27vCEoGl;?wIS*)*SU6<>)97OUjFMX7@^etZ+EC>gxga89gLh>3Ro^f!-U#2xxZ z2Lubz`9h@UX^=^qnO>Q)ea)j6E$3g5!cVb%J+Ujrd;}>jzO7;Dy7uXa_{?o#Y2EWR z$Eke1l%p**GsJ@|Ln%Z=egN5E{rV0S(t}AG_ZZm*g=d}7wywoDaSpm`)Mg9_OxHMo zX0=kESE${95+ezDiv*!~?IFn{^q3Hgu?Oe(n$zv)=&kb8tlDk}*4xKhq7?2Z*@83cKFZgwOnvZxXMYRNkK3n@05 zBn9=lM?+Jns%f@lHo_sdreAqm;0M%n{7%?7Abs_%8fA!|q literal 0 HcmV?d00001 diff --git a/tutorials/TUTORIAL-1_OVERVIEW.md b/tutorials/TUTORIAL-1_OVERVIEW.md index 8af77bf..f9a285c 100644 --- a/tutorials/TUTORIAL-1_OVERVIEW.md +++ b/tutorials/TUTORIAL-1_OVERVIEW.md @@ -1,11 +1,34 @@ -# Tutorial 1: Understanding the modules +# Tutorial 1: fabricator Introduction -## General +### Recipe for Dataset Generation 📚 +When starting from scratch, to generate an arbitrary dataset, you need to implement some instance of: -The modules are the main building blocks of the application. This tutorial will explain the basic concepts of the -modules and how to use them. +- **_Datasets_**: For few-shot examples and final storage of (pair-wise) data to train a small PLM. +- **_LLMs_**: To annotate existing, unlabeled datasets or generate completely new ones. +- **_Prompts_**: To format the provided inputs (task description, few-shot examples, etc.) to prompt the LLM. +- **_Orchestrator_**: To aligns all components and steer the generation process. -## Datasets +### Creating a workflow from scratch requires careful consideration of intricate details 👨‍🍳 +The following figure illustrates the typical generation workflow when using large language models as teachers for +smaller pre-trained language models (PLMs) like BERT. Establishing this workflow demands attention to implementation +details and requires boilerplate. Further, the setup process may vary based on a particular LLM or dataset format. + +
+    Generation Workflow
The generation workflow when using LLMs as teachers for smaller PLMs such as BERT.
+
+

### Efficiently generate datasets with fabricator 🍜

With fabricator, you simply need to define your generation settings,
e.g. how many few-shot examples to include per prompt, how to sample few-shot instances from a pool of available
examples, or which LLM to use. In addition, everything is built on top of Hugging Face's
[datasets](https://github.com/huggingface/datasets) library so that you can directly
incorporate the generated datasets in your usual training workflows or share them via the Hugging Face hub.

## Fabricator Components

### Datasets

Datasets are built upon the `Dataset` class of the huggingface datasets library. They are used to store the data in a
tabular format and provide a convenient way to access the data. The generated datasets will always be in that format such
@@ -25,10 +48,10 @@ dataset = load_dataset("json", data_files="path/to/file.jsonl")
dataset.push_to_hub("my-dataset")
```

-## LLMs
-We use haystack's `PromptNode` to generate the data. The PromptNode is a wrapper for multiple LLMs such as the ones
+### LLMs
+We simply use haystack's `PromptNode` as our LLM interface. The PromptNode is a wrapper for multiple LLMs such as the ones
 from OpenAI or all available models on the huggingface hub. You can set all generation-related parameters such as
-temperature, top_k, maximum generation length via the PromptNode.
+temperature, top_k, maximum generation length via the PromptNode (see also the [documentation](https://docs.haystack.deepset.ai/docs/prompt_node)).

```python
import os

prompt_node = PromptNode(
)
```
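As a quick aside, a minimal sketch contrasting the two backend options. It assumes only the `PromptNode` arguments already used in these tutorials (`model_name_or_path`, `api_key`, `max_length`); the flan-t5 checkpoint is just an example of a hub model:

```python
import os
from haystack.nodes import PromptNode

# Hosted API model: assumes an OpenAI key is exported in the environment.
api_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=os.environ.get("OPENAI_API_KEY"),
    max_length=100,
)

# Open-source model from the Hugging Face hub: downloaded and run locally,
# no API key required.
local_node = PromptNode("google/flan-t5-base", max_length=100)
```

Both nodes expose the same interface, so the rest of the generation workflow stays unchanged when you swap models.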
-## Prompt Templates
-
-Prompt templates are used to generate the prompts for the LLMs. This class is highly flexible and is used to define (i)
-the task description (what should the LLM generate), (ii) label options to choose from when i.e. classiyfing text and
-(iii) the format of optional fewshot examples. The prompt template instance will be passed to the `DatasetGenerator`
-class and, if fewshot examples are passed, create the final prompts based on the columns present in the dataset.
-
-Note: Since the `DatasetGenerator` class parses the generated output directly into the target dataset format: Ensure that
-your prompt generates exactly one data point per prompt since the output will be taken as is.
+### Prompts
+
+The `BasePrompt` class is used to format the prompts for the LLMs. This class is highly flexible and thus can be
+adapted to various settings:
+- define a `task_description` (e.g. "Generate a [_label_] movie review.") to generate data for a certain class, e.g. a movie review for the label "positive".
+- include pre-defined `label_options` (e.g. "Annotate the following review with one of the following labels: positive, negative.") when annotating unlabeled datasets.
+- customize the format of fewshot examples inside the prompt

-### Create a minimal prompt for generating text without fewshot examples
+#### Prompt for generating plain text

```python
from fabricator.prompts import BasePrompt

-prompt_template = BasePrompt(task_description="Generate movie reviews.")
+prompt_template = BasePrompt(task_description="Generate a movie review.")
print(prompt_template.get_prompt_text())
```

-Output:
+Prompt during generation:
```console
-Generate movies reviews.
+Generate a movie review.

text:
```

-### Create a prompt with label options
-Label options are insert into a formattable task description to guide the LLM to generate the desired output. The label
-options are a list of strings. When generating data, the `DatasetGenerator` class will uniformly sample
-one of the label options and insert it into the task description such that the generated dataset is balanced.
+#### Prompt for label-conditioned generation with label options

```python
from fabricator.prompts import BasePrompt

label_options = ["positive", "negative"]

prompt_template = BasePrompt(
    task_description="Generate a {} movie review.",
    label_options=label_options,
)
-
-for label in label_options:
-    print(prompt_template.get_prompt_text(label) + "\n---\n")
```

-Output:
+Label-conditioned prompts during generation:
```console
-Generate a positive movie reviews.
+Generate a positive movie review.

text:
---
-Generate a negative movie reviews.
+Generate a negative movie review.

text:
---
```

+Note: You can define the desired label distribution in the orchestrator class, e.g. uniform
+sampling from both labels in each iteration.

-### Create a prompt that generates movie reviews with fewshot examples
-
-With fewshot examples, we are able to create a additional training examples for a tiny dataset. The
-`generate_data_for_column` variable defines the column that should be generated by the LLM and can be any column from
-the dataset. As previously introduced, the `DatasetGenerator` will create a balanced dataset by uniformly sampling from
-the label options. At runtime, when generating additional data, the `DatasetGenerator` samples data points from the
-fewshot dataset that have the same label as used in the task description as exemplarily shown in the output.
+#### Prompt with few-shot examples

```python
from datasets import Dataset
from fabricator.prompts import BasePrompt

label_options = ["positive", "negative"]

fewshot_examples = Dataset.from_dict({
    "text": ["This movie is great!", "This movie is bad!"],
    "label": label_options
})

prompt_template = BasePrompt(
    task_description="Generate a {} movie review.",
    label_options=label_options,
    generate_data_for_column="text",
)
-
-for idx, label in enumerate(label_options):
-    print(prompt_template.get_prompt_text(label, fewshot_examples.select([idx])) + "\n---\n")
```

-Output:
+Prompts with few-shot examples during generation:
```console
Generate a positive movie review.

text: This movie is great!

text:
---
Generate a negative movie review.

text: This movie is bad!

text:
---
```

+Note: The `generate_data_for_column` attribute defines the column of the few-shot dataset for which additional data is generated.
+As previously shown, the orchestrator will sample a label and include a matching few-shot example.
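To inspect the rendered prompts yourself before spending LLM calls, a small preview sketch can be reconstructed from the removed loops above (this assumes `get_prompt_text` still accepts a label and, optionally, few-shot examples, as shown there):

```python
from datasets import Dataset
from fabricator.prompts import BasePrompt

label_options = ["positive", "negative"]

fewshot_examples = Dataset.from_dict({
    "text": ["This movie is great!", "This movie is bad!"],
    "label": label_options
})

prompt_template = BasePrompt(
    task_description="Generate a {} movie review.",
    label_options=label_options,
    generate_data_for_column="text",
)

# Render one prompt per label, each with a matching few-shot example;
# during generation, the orchestrator performs this sampling automatically.
for idx, label in enumerate(label_options):
    print(prompt_template.get_prompt_text(label, fewshot_examples.select([idx])) + "\n---\n")
```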
-### Create a prompt that annotates unlabeled movie reviews with fewshot examples
-
-If you want to annotate unlabeled data, you can use the `fewshot_example_columns` attribute to define the columns that
-should be used as fewshot examples. The `generate_data_for_column` variable now defines the column that should be
-annotated by the LLM as illustrated in the previous example.
-
-```python
-from datasets import Dataset
-from fabricator.prompts import BasePrompt
-
-label_options = ["positive", "negative"]
-
-fewshot_examples = Dataset.from_dict({
-    "text": ["This movie is great!", "This movie is bad!"],
-    "label": label_options
-})
-
-prompt_template = BasePrompt(
-    task_description="Annotate movie reviews as either: {}.",
-    label_options=["positive", "negative"],
-    generate_data_for_column="label",
-    fewshot_example_columns="text",
-)
-
-print(prompt_template.get_prompt_text(label_options, fewshot_examples) + "\n---")
-
-invocation_context = {"text": "This movie was a blast!"}
-print(prompt_template.get_prompt_text(label_options, fewshot_examples).format(**invocation_context))
-```
-
-Output:
-```console
-Annotate movie reviews as either: positive, negative.
-
-text: This movie is great!
-label: positive
-
-text: This movie is bad!
-label: negative
-
-text: {text}
-label:
----
-Annotate movie reviews as either: positive, negative.
-
-text: This movie is great!
-label: positive
-
-text: This movie is bad!
-label: negative
-
-text: This movie was a blast!
-label:
-```
-
-## DatasetGenerator
-
-The `DatasetGenerator` class is used to generate the final dataset. It takes a `Dataset`, a `PromptNode` and a
-`BasePrompt` as inputs and generates the final dataset based on the prompt template and the optionally provided
-fewshot examples. The `generate` method returns a `Dataset` object that can be used with standard machine learning
-frameworks such as `transformers`.
+### DatasetGenerator
+
+The `DatasetGenerator` class is fabricator's orchestrator. It takes a `Dataset`, a `PromptNode` and a
+`BasePrompt` as inputs and generates the final dataset based on these instances. The `generate` method returns a `Dataset` object that can be used with standard machine learning
+frameworks such as [flair](https://github.com/flairNLP/flair), deepset's [haystack](https://github.com/deepset-ai/haystack), or Hugging Face's [transformers](https://github.com/huggingface/transformers).

```python
-from datasets import Dataset
-from fabricator import DatasetGenerator
+from haystack.nodes import PromptNode
+from fabricator import DatasetGenerator
+from fabricator.prompts import BasePrompt

-fewshot_examples = Dataset.from_dict({
-    "text": ["This movie is great!", "This movie is bad!"],
-    "label": ["positive", "negative"]
-})
+prompt_template = BasePrompt(task_description="Generate a movie review.")

-unlabeled_dataset = Dataset.from_dict({
-    "text": ["This movie was okay!", "This movie is better than I expected!"],
-})
+prompt_node = PromptNode("google/flan-t5-base")

generator = DatasetGenerator(prompt_node)
generated_dataset = generator.generate(
-    fewshot_dataset=fewshot_examples,
-    unlabeled_dataset=unlabeled_dataset,
-    prompt_template=prompt_template, # from above
-    max_prompt_calls=5, # max number of calls to the LLM
-    fewshot_examples_per_class=1, # number of fewshot examples per class per prompt
+    prompt_template=prompt_template,
+    max_prompt_calls=10,
)
```

+In the following [tutorial](TUTORIAL-2_SIMPLE-GENERATION.md), we introduce the different generation processes covered by fabricator.

From 184582b46cb4d238e8d415193bbbbfd29fde40c6 Mon Sep 17 00:00:00 2001
From: whoisjones
Date: Wed, 25 Oct 2023 15:25:41 +0200
Subject: [PATCH 5/6] improvement TUTORIAL 2 + 3

---
 tutorials/TUTORIAL-1_OVERVIEW.md             |  22 +--
 tutorials/TUTORIAL-2_GENERATION_WORKFLOWS.md | 152 ++++++++++++++++
 tutorials/TUTORIAL-2_SIMPLE-GENERATION.md    | 179 -------------------
 tutorials/TUTORIAL-3_ADVANCED-GENERATION.md  |  21 ++-
 4 files changed, 174 insertions(+), 200 deletions(-)
 create mode 100644 tutorials/TUTORIAL-2_GENERATION_WORKFLOWS.md
 delete mode 100644 tutorials/TUTORIAL-2_SIMPLE-GENERATION.md

diff --git a/tutorials/TUTORIAL-1_OVERVIEW.md b/tutorials/TUTORIAL-1_OVERVIEW.md
index f9a285c..50bd31b 100644
--- a/tutorials/TUTORIAL-1_OVERVIEW.md
+++ b/tutorials/TUTORIAL-1_OVERVIEW.md
@@ -1,6 +1,8 @@
-# Tutorial 1: fabricator Introduction
+# Tutorial 1: Fabricator Introduction

-### Recipe for Dataset Generation 📚
+## 1) Dataset Generation
+
+### 1.1) Recipe for Dataset Generation 📚
 When starting from scratch, to generate an arbitrary dataset, you need to implement some instances of:

 - **_Datasets_**: For few-shot examples and final storage of (pair-wise) data to train a small PLM.
 - **_LLMs_**: To annotate existing, unlabeled datasets or generate completely new ones.
 - **_Prompts_**: To format the provided inputs (task description, few-shot examples, etc.)
to prompt the LLM.
 - **_Orchestrator_**: To align all components and steer the generation process.

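Taken together, here is a minimal end-to-end sketch of how these four pieces interact. It uses only classes and arguments that appear in these tutorials, and the flan-t5 checkpoint is merely an illustrative choice:

```python
from datasets import Dataset
from haystack.nodes import PromptNode
from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt

# Datasets: a tiny pool of labeled few-shot examples.
fewshot_dataset = Dataset.from_dict({
    "text": ["This movie is great!", "This movie is bad!"],
    "label": ["positive", "negative"],
})

# Prompts: task description with a placeholder for the sampled label.
prompt = BasePrompt(
    task_description="Generate a {} movie review.",
    label_options=["positive", "negative"],
    generate_data_for_column="text",
)

# LLMs: any model supported by haystack's PromptNode.
prompt_node = PromptNode("google/flan-t5-base")

# Orchestrator: aligns all components and steers the generation.
generator = DatasetGenerator(prompt_node)
generated_dataset = generator.generate(
    prompt_template=prompt,
    fewshot_dataset=fewshot_dataset,
    fewshot_examples_per_class=1,
    fewshot_sampling_strategy="uniform",
    fewshot_sampling_column="label",
    max_prompt_calls=10,
)
```

The orchestrator samples a label, renders the prompt with a matching few-shot example, and collects the LLM outputs into a regular `Dataset`.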
-### Creating a workflow from scratch requires careful consideration of intricate details 👨‍🍳
+### 1.2) Creating a Workflow From Scratch Requires Careful Consideration of Intricate Details 👨‍🍳
 The following figure illustrates the typical generation workflow when using large language models as teachers for
 smaller pre-trained language models (PLMs) like BERT. Establishing this workflow demands attention to implementation
 details and requires boilerplate. Further, the setup process may vary based on a particular LLM or dataset format.
@@ -18,7 +20,7 @@
 The generation workflow when using LLMs as teachers for smaller PLMs such as BERT.

-### Efficiently generate datasets with fabricator 🍜
+### 1.3) Efficiently Generate Datasets With Fabricator 🍜

 With fabricator, you simply need to define your generation settings,
 e.g. how many few-shot examples to include per prompt, how to sample few-shot instances from a pool of available
 examples, or which LLM to use. In addition, everything is built on top of Hugging Face's
 [datasets](https://github.com/huggingface/datasets) library so that you can directly
 incorporate the generated datasets in your usual training workflows or share them via the Hugging Face hub.

-## Fabricator Components
+## 2) Fabricator Components

-### Datasets
+### 2.1) Datasets

 Datasets are built upon the `Dataset` class of the huggingface datasets library. They are used to store the data in a
 tabular format and provide a convenient way to access the data. The generated datasets will always be in that format such
@@ -48,7 +50,7 @@ dataset = load_dataset("json", data_files="path/to/file.jsonl")
 dataset.push_to_hub("my-dataset")
 ```

-### LLMs
+### 2.2) LLMs
 We simply use haystack's `PromptNode` as our LLM interface. The PromptNode is a wrapper for multiple LLMs such as the ones
 from OpenAI or all available models on the huggingface hub. You can set all generation-related parameters such as
 temperature, top_k, maximum generation length via the PromptNode (see also the [documentation](https://docs.haystack.deepset.ai/docs/prompt_node)).
@@ -67,7 +69,7 @@ prompt_node = PromptNode(
 )
 ```

-### Prompts
+### 2.3) Prompts

 The `BasePrompt` class is used to format the prompts for the LLMs. This class is highly flexible and thus can be
 adapted to various settings:
@@ -158,7 +160,7 @@ text:
 Note: The `generate_data_for_column` attribute defines the column of the few-shot dataset for which additional data is generated.
 As previously shown, the orchestrator will sample a label and include a matching few-shot example.

-### DatasetGenerator
+### 2.4) DatasetGenerator

 The `DatasetGenerator` class is fabricator's orchestrator. It takes a `Dataset`, a `PromptNode` and a
 `BasePrompt` as inputs and generates the final dataset based on these instances. The `generate` method returns a `Dataset` object that can be used with standard machine learning
 frameworks such as [flair](https://github.com/flairNLP/flair), deepset's [haystack](https://github.com/deepset-ai/haystack), or Hugging Face's [transformers](https://github.com/huggingface/transformers).
@@ -178,4 +180,4 @@ generated_dataset = generator.generate(
 )
 ```

-In the following [tutorial](TUTORIAL-2_SIMPLE-GENERATION.md), we introduce the different generation processes covered by fabricator.
+In the following [tutorial](TUTORIAL-2_GENERATION_WORKFLOWS.md), we introduce the different generation processes covered by fabricator.
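Since everything is built on top of Hugging Face datasets, a generated dataset can be pulled back from the hub and prepared for fine-tuning a small PLM. A minimal sketch (the dataset id and model name are illustrative):

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Load a generated dataset back from the hub (replace with your own hub id).
dataset = load_dataset("your-username/your-first-generated-dataset", split="train")

# Standard datasets utilities apply, e.g. creating a held-out split ...
splits = dataset.train_test_split(test_size=0.1)

# ... or tokenizing the text column for a small PLM such as BERT.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized = splits["train"].map(
    lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length"),
    batched=True,
)
```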
diff --git a/tutorials/TUTORIAL-2_GENERATION_WORKFLOWS.md b/tutorials/TUTORIAL-2_GENERATION_WORKFLOWS.md
new file mode 100644
index 0000000..afc1390
--- /dev/null
+++ b/tutorials/TUTORIAL-2_GENERATION_WORKFLOWS.md
@@ -0,0 +1,152 @@
+# Tutorial 2: Generation Workflows
+
+In this tutorial, you will learn:
+1. how to generate datasets
+2. how to annotate unlabeled datasets
+3. how to configure hyperparameters for your generation process
+
+## 1) Generating Datasets
+
+### 1.1) Generating Plain Text
+
+In this example, we demonstrate how to combine the fabricator components introduced in the previous tutorial to create a
+movie review dataset. We don't explicitly direct the large language model (LLM) to generate movie reviews for
+specific labels (such as binary sentiment) or offer a few examples to guide the LLM in generating similar content.
+Instead, all it requires is a task description.
The LLM then produces a dataset containing movie reviews based on the +provided instructions. This dataset can be easily uploaded to the Hugging Face Hub. + +```python +import os +from haystack.nodes import PromptNode +from fabricator import DatasetGenerator +from fabricator.prompts import BasePrompt + +prompt = BasePrompt( + task_description="Generate a very very short movie review.", +) + +prompt_node = PromptNode( + model_name_or_path="gpt-3.5-turbo", + api_key=os.environ.get("OPENAI_API_KEY"), + max_length=100, +) + +generator = DatasetGenerator(prompt_node) +generated_dataset = generator.generate( + prompt_template=prompt, + max_prompt_calls=10, +) + +generated_dataset.push_to_hub("your-first-generated-dataset") +``` + +### 1.2) Generate Label-Conditioned Datasets With Label Options and Few-Shot Examples +To create datasets that are conditioned on specific labels and use few-shot examples, +we need a few-shot dataset that is already annotated. The prompt should have the same labels as those in the few-shot +dataset. Additionally, as explained in a previous tutorial, we must set the `generate_data_for_column` +parameter to specify the column in the dataset for which we want to generate text. + +In the dataset generator, we define certain hyperparameters for the generation process. `fewshot_examples_per_class` +determines how many few-shot examples are incorporated for each class per prompt. `fewshot_sampling_strategy` +can be set to either "uniform" if each label has an equal chance of being sampled, +or "stratified" if the distribution from the few-shot dataset needs to be preserved. +`fewshot_sampling_column` specifies the dataset column representing the classes. `max_prompt_calls` +sets the limit for how many prompts should be generated. + +Crucially, the prompt instance contains all the details about how a single prompt for a specific data point should +be structured. This includes information like which few-shot examples should appear alongside which task instruction. +On the other hand, the dataset generator defines the overall generation process, +such as determining the label distribution, specified by the `fewshot_sampling_column`. + +```python +import os +from datasets import Dataset +from haystack.nodes import PromptNode +from fabricator import DatasetGenerator +from fabricator.prompts import BasePrompt + +label_options = ["positive", "negative"] + +fewshot_dataset = Dataset.from_dict({ + "text": ["This movie is great!", "This movie is bad!"], + "label": label_options +}) + +prompt = BasePrompt( + task_description="Generate a {} movie review.", + label_options=label_options, + generate_data_for_column="text", +) + +prompt_node = PromptNode( + model_name_or_path="gpt-3.5-turbo", + api_key=os.environ.get("OPENAI_API_KEY"), + max_length=100, +) + +generator = DatasetGenerator(prompt_node) +generated_dataset = generator.generate( + prompt_template=prompt, + fewshot_dataset=fewshot_dataset, + fewshot_examples_per_class=1, + fewshot_sampling_strategy="uniform", + fewshot_sampling_column="label", + max_prompt_calls=10, +) + +generated_dataset.push_to_hub("your-first-generated-dataset") +``` + +## 2) Annotate unlabeled data with fewshot examples + +This example demonstrates how to add annotations to unlabeled data using few-shot examples. We have a few-shot dataset containing two columns: `text` and `label`, and an unlabeled dataset with only a `text` column. The goal is to annotate the unlabeled dataset using information from the few-shot dataset. 
+ +To achieve this, we utilize the `DatasetGenerator.generate()` method. To begin, we provide the `unlabeled_dataset` argument, indicating the dataset we want to annotate. Additionally, we specify the `fewshot_examples_per_class` argument, determining how many few-shot examples to use for each class. In this scenario, we choose one example per class. + +The `fewshot_sampling_strategy` argument dictates how the few-shot dataset is sampled. In this case, we employ a stratified sampling strategy. This means that the generator will select precisely one example from each class within the few-shot dataset. + +It's worth noting that there's no need to explicitly specify the `fewshot_sampling_column` argument. By default, the generator uses the column specified in `generate_data_for_column` for this purpose. + +```python +import os +from datasets import Dataset +from haystack.nodes import PromptNode +from fabricator import DatasetGenerator +from fabricator.prompts import BasePrompt + +label_options = ["positive", "negative"] + +fewshot_dataset = Dataset.from_dict({ + "text": ["This movie is great!", "This movie is bad!"], + "label": label_options +}) + +unlabeled_dataset = Dataset.from_dict({ + "text": ["This movie was a blast!", "This movie was not bad!"], +}) + +prompt = BasePrompt( + task_description="Annotate movie reviews as either: {}.", + label_options=label_options, + generate_data_for_column="label", + fewshot_example_columns="text", +) + +prompt_node = PromptNode( + model_name_or_path="gpt-3.5-turbo", + api_key=os.environ.get("OPENAI_API_KEY"), + max_length=100, +) + +generator = DatasetGenerator(prompt_node) +generated_dataset = generator.generate( + prompt_template=prompt, + fewshot_dataset=fewshot_dataset, + fewshot_examples_per_class=1, + fewshot_sampling_strategy="stratified", + unlabeled_dataset=unlabeled_dataset, + max_prompt_calls=10, +) + +generated_dataset.push_to_hub("your-first-generated-dataset") +``` diff --git a/tutorials/TUTORIAL-2_SIMPLE-GENERATION.md b/tutorials/TUTORIAL-2_SIMPLE-GENERATION.md deleted file mode 100644 index f0d5837..0000000 --- a/tutorials/TUTORIAL-2_SIMPLE-GENERATION.md +++ /dev/null @@ -1,179 +0,0 @@ -# Tutorial 2: Generate simple datasets - -## Generate without fewshot examples - -This example shows how to generate a dataset without fewshot examples. It just take a task -description and returns a dataset with movie reviews which can be pushed -to the HuggingFace Hub. - -```python -import os -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt - -prompt = BasePrompt( - task_description="Generate a very very short movie review.", -) - -prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, -) - -generator = DatasetGenerator(prompt_node) -generated_dataset = generator.generate( - prompt_template=prompt, - max_prompt_calls=10, -) - -generated_dataset.push_to_hub("your-first-generated-dataset") -``` - -## Generate with label options - -This example shows how to generate a dataset with label options. As introduced in previous tutorial, -this can be achieved by providing a `label_options` argument to the `BasePrompt` constructor. 
- -```python -import os -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt - -label_options = ["positive", "negative"] - -prompt = BasePrompt( - task_description="Generate a very very short, {} movie review.", - label_options=label_options, -) - -prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, -) - -generator = DatasetGenerator(prompt_node) -generated_dataset = generator.generate( - prompt_template=prompt, - max_prompt_calls=10, -) - -generated_dataset.push_to_hub("your-first-generated-dataset") -``` - -With label_options specified, the generator will uniformly sample one of the label options and insert it into the task -description which ensure that the generated dataset is balanced. - - -## Generate with fewshot examples -This example shows how to generate a dataset with fewshot examples. As introduced in previous tutorial, this can be -achieved by providing a `fewshot_dataset` argument to the `DatasetGenerator.generate()` method. - -First, we create an annotated `fewshot_dataset` with two columns: `text` and `label`. In order to generate new movie -reviews and provide the LLM with examples, we need to specify the `generate_data_for_column` argument in the -`BasePrompt` constructor. This argument tells the generator which column to generate data for. - -Since we are using fewshot examples, we can control the prompt generation through different sampling strategies and -number of examples per class. We pass the `fewshot_dataset` to the generate function and specify to use one fewshot -example per class per prompt. The `fewshot_sampling_strategy` argument specifies how to sample from the fewshot -dataset. In this case, we use a uniform sampling strategy which means that the generator will uniformly sample one -example per class from the fewshot dataset. The `fewshot_sampling_column` argument specifies which column to use for -sampling. In this case, we use the `label` column. - -```python -import os -from datasets import Dataset -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt - -label_options = ["positive", "negative"] - -fewshot_dataset = Dataset.from_dict({ - "text": ["This movie is great!", "This movie is bad!"], - "label": label_options -}) - -prompt = BasePrompt( - task_description="Generate a {} movie review.", - label_options=label_options, - generate_data_for_column="text", -) - -prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, -) - -generator = DatasetGenerator(prompt_node) -generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=1, - fewshot_sampling_strategy="uniform", - fewshot_sampling_column="label", - max_prompt_calls=10, -) - -generated_dataset.push_to_hub("your-first-generated-dataset") -``` - -## Annotate unlabeled data with fewshot examples - -This example shows how to annotate unlabeled data with fewshot examples. In this case, we have a fewshot dataset with -two columns: `text` and `label`. We also have an unlabeled dataset with only a `text` column. We want to annotate the -unlabeled dataset with the fewshot dataset. In order to do this, we need to specify the `unlabeled_dataset` argument -to the `DatasetGenerator.generate()` method. 
We also need to specify the `fewshot_examples_per_class` argument to -specify how many fewshot examples to use per class. In this case, we use one example per class. The -`fewshot_sampling_strategy` argument specifies how to sample from the fewshot dataset. -In this case, we use a stratfied sampling strategy which means that the generator will sample exactly one example from -each class from the fewshot dataset. In this case, we do not need to explicitly specify the `fewshot_sampling_column` -argument since the generator will use the column specified in `generate_data_for_column` by default. - -```python -import os -from datasets import Dataset -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt - -label_options = ["positive", "negative"] - -fewshot_dataset = Dataset.from_dict({ - "text": ["This movie is great!", "This movie is bad!"], - "label": label_options -}) - -unlabeled_dataset = Dataset.from_dict({ - "text": ["This movie was a blast!", "This movie was not bad!"], -}) - -prompt = BasePrompt( - task_description="Annotate movie reviews as either: {}.", - label_options=label_options, - generate_data_for_column="label", - fewshot_example_columns="text", -) - -prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, -) - -generator = DatasetGenerator(prompt_node) -generated_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=1, - fewshot_sampling_strategy="stratified", - unlabeled_dataset=unlabeled_dataset, - max_prompt_calls=10, -) - -generated_dataset.push_to_hub("your-first-generated-dataset") -``` diff --git a/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md b/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md index 9f99f1d..024745d 100644 --- a/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md +++ b/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md @@ -1,6 +1,6 @@ -# Tutorial 3: Generate advanced datasets +# Tutorial 3: Advanced Dataset Generation -## Customize prompts +## Customizing Prompts Sometimes, you want to customize your prompt to your specific needs. For example, you might want to add a custom formatting template (the default takes the column names of the dataset): @@ -43,13 +43,12 @@ Movie Review: {text} Sentiment: ``` -## Infer prompt from dataset +## Inferring the prompt from dataset info Huggingface Dataset objects provide the possibility to infer a prompt from the dataset. This can be achieved by using -the `infer_prompt_from_dataset` function from the `ai_dataset_generator.prompts` module. This function takes a dataset +the `infer_prompt_from_dataset` function. This function takes a dataset as input and returns a `BasePrompt` object. The `BasePrompt` object contains the task description, the label options -and the respective columns which can be used to generate a dataset with the `DatasetGenerator` class. The default -assignment is that data is going to be generated for the label column specified in the `task_template`. +and the respective columns which can be used to generate a dataset with the `DatasetGenerator` class. 
```python from datasets import load_dataset @@ -90,7 +89,7 @@ extractive question answering: from datasets import load_dataset from fabricator.prompts import infer_prompt_from_dataset -dataset = load_dataset("imdb", split="train") +dataset = load_dataset("squad_v2", split="train") prompt = infer_prompt_from_dataset(dataset) print(prompt.get_prompt_text() + "\n---") @@ -123,10 +122,10 @@ answers: ## Preprocess datasets -The previous example emphasized how easy prompts can be generated using huggingface Datasets. However, -there is a potential mismatch between label names and label IDs in the fewshot data points -or structured answers as for the example of SQuAD. In order to best utilize the LLM, datasets will often -require some sort of preprocessing. +In the previous example, we highlighted the simplicity of generating prompts using Hugging Face Datasets information. +However, for optimal utilization of LLMs in generating text, it's essential to incorporate label names instead of IDs +for text classification. Similarly, for question answering tasks, plain substrings are preferred over JSON-formatted +strings. We'll elaborate on these limitations in the following example. ```text Classify the following texts exactly into one of the following categories: **neg, pos**. From 511b47a095ac938b0c458ff260711b401d5f4146 Mon Sep 17 00:00:00 2001 From: whoisjones Date: Wed, 25 Oct 2023 17:05:44 +0200 Subject: [PATCH 6/6] finalize tutorial 3 for release --- tutorials/TUTORIAL-3_ADVANCED-GENERATION.md | 113 ++++---------------- 1 file changed, 23 insertions(+), 90 deletions(-) diff --git a/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md b/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md index 024745d..feb8a9c 100644 --- a/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md +++ b/tutorials/TUTORIAL-3_ADVANCED-GENERATION.md @@ -43,7 +43,7 @@ Movie Review: {text} Sentiment: ``` -## Inferring the prompt from dataset info +## Inferring the Prompt rom Dataset Info Huggingface Dataset objects provide the possibility to infer a prompt from the dataset. This can be achieved by using the `infer_prompt_from_dataset` function. This function takes a dataset @@ -142,15 +142,12 @@ question: What term characterizes the intersection of the rites with the Roman C answers: **{'text': ['full union'], 'answer_start': [104]}** ``` -To do so, we provide a range of preprocessing functions for downstream tasks. +To overcome this, we provide a range of preprocessing functions for various downstream tasks. ### Text Classification -The `convert_label_ids_to_texts` function transform your text classification dataset with label IDs into textual -labels. The default will be the label names specified in the features column. You can also directly return the label -options if you want to create a custom prompt from `BasePrompt` class. In the example, we only -return them for logging since we use our `infer_prompt_from_dataset` method which automatically -uses the label names specified in the dataset. +The `convert_label_ids_to_texts` function transforms your text classification dataset with label IDs into textual +labels. The default will be the label names specified in the features column. ```python from datasets import load_dataset @@ -207,9 +204,7 @@ text: {text} label: ``` -During the dataset generation we expect the model to generate the labels in the explicitly defined label options but -do not filter if this is not the case. We observed in our experiments that this does not occur if the prompt is precise -and consistent. 
Once the dataset is generated, one can easily convert the string labels back to label IDs by +Once the dataset is generated, one can easily convert the string labels back to label IDs by using huggingface's `class_encode_labels` function. ```python @@ -221,23 +216,27 @@ print("Features: " + str(dataset.features["label"])) Which yields: ```text -Labels: [0, 0, 0, 0, 0] +Labels: [0, 1, 1, 0, 0] Features: ClassLabel(names=['negative', 'positive'], id=None) ``` +Note: While generating the dataset, the model is supposed to assign labels based on the specific options +provided. However, we do not filter the data if it doesn't adhere to these predefined labels. +Therefore, it's important to double-check if the annotations match the expected label options. +If they don't, you should make corrections accordingly. + ### Question Answering (Extractive) -For question answering, we provide two functions to preprocess and postprocess the dataset. The preprocessing function -will convert datasets in SQuAD-format to a flat format such that the inputs to the LLM will be strings. -The postprocessing function will convert the predictions back to the SQuAD-format by calculating the answer -start and log if this answer can't be found in the context or if the answer occurs multiple times. +In question answering tasks, we offer two functions to handle dataset processing: preprocessing and postprocessing. +The preprocessing function is responsible for transforming datasets from SQuAD format into flat strings. +On the other hand, the postprocessing function reverses this process by converting flat predictions back into +SQuAD format. It achieves this by determining the starting point of the answer and checking if the answer cannot be +found in the given context or if it occurs multiple times. ```python from datasets import load_dataset from fabricator.prompts import infer_prompt_from_dataset -from fabricator.dataset_transformations.question_answering import preprocess_squad_format, - -postprocess_squad_format +from fabricator.dataset_transformations.question_answering import preprocess_squad_format, postprocess_squad_format dataset = load_dataset("squad_v2", split="train") prompt = infer_prompt_from_dataset(dataset) @@ -268,8 +267,8 @@ answers: ### Named Entity Recognition -If you want to generate a dataset for named entity recognition without any preprocesing, the prompt would be difficult -to understand for the LLM. +If you attempt to create a dataset for named entity recognition without any preprocessing, the prompt might be +challenging for the language model to understand. ```python from datasets import load_dataset @@ -298,8 +297,9 @@ tokens: {tokens} ner_tags: ``` -To make the prompt more understandable, we can preprocess the dataset such that the labels are converted to spans. -This can be done by using the `convert_token_labels_to_spans` function. The function will also return the label options +To enhance prompt clarity, we can preprocess the dataset by converting labels into spans. This conversion can be +accomplished using the `convert_token_labels_to_spans` function. 
Additionally, the function will provide the +available label options: ```python from datasets import load_dataset @@ -329,7 +329,7 @@ tokens: {tokens} ner_tags: ``` -As in text classification, we can also specfiy more semantically precise labels with the `expanded_label_mapping`: +As in text classification, we can also specify more semantically precise labels with the `expanded_label_mapping`: ```python expanded_label_mapping = { @@ -535,70 +535,3 @@ generated_dataset = generated_dataset.class_encode_column("label") generated_dataset.push_to_hub("your-first-generated-dataset") ``` - -### Token classification -Note: Token classification is currently still in development and often yields instable results. In particular the -conversion between spans and token labels welcomes contributions for a more stable implementation. - -```python -import os -from datasets import load_dataset -from haystack.nodes import PromptNode -from fabricator import DatasetGenerator -from fabricator.prompts import BasePrompt -from fabricator.dataset_transformations import convert_token_labels_to_spans, convert_spans_to_token_labels - -dataset = load_dataset("conll2003", split="train") -expanded_label_mapping = { - 0: "O", - 1: "B-person", - 2: "I-person", - 3: "B-location", - 4: "I-location", - 5: "B-organization", - 6: "I-organization", - 7: "B-miscellaneous", - 8: "I-miscellaneous", -} -dataset, label_options = convert_token_labels_to_spans(dataset, "tokens", "ner_tags", expanded_label_mapping) - -fewshot_dataset = dataset.select(range(10)) -unlabeled_dataset = dataset.select(range(10, 20)) - -prompt = BasePrompt( - task_description="Annotate each token with its named entity label: {}.", - generate_data_for_column="ner_tags", - fewshot_example_columns="tokens", - label_options=list(expanded_label_mapping.values()), -) - -prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo", - api_key=os.environ.get("OPENAI_API_KEY"), - max_length=100, -) - -generator = DatasetGenerator(prompt_node) -generated_dataset, original_dataset = generator.generate( - prompt_template=prompt, - fewshot_dataset=fewshot_dataset, - fewshot_examples_per_class=3, # Take 3 fewshot examples (for token-level class we do not sample per class) - unlabeled_dataset=unlabeled_dataset, - max_prompt_calls=10, - return_unlabeled_dataset=True, -) - -generated_dataset = convert_spans_to_token_labels( - dataset=generated_dataset, - token_column="tokens", - label_column="ner_tags", - id2label=expanded_label_mapping -) - -original_dataset = convert_spans_to_token_labels( - dataset=original_dataset, - token_column="tokens", - label_column="ner_tags", - id2label=expanded_label_mapping -) -```