diff --git a/src/fabricator/dataset_generator.py b/src/fabricator/dataset_generator.py
index 6b688a7..405b3bb 100644
--- a/src/fabricator/dataset_generator.py
+++ b/src/fabricator/dataset_generator.py
@@ -17,9 +17,11 @@
 from .samplers import single_label_stratified_sample
 from .utils import log_dir, create_timestamp_path
 
+
 class DatasetGenerator:
     """The DatasetGenerator class is the main class of the fabricator package.
-    It generates datasets based on a prompt template. The main function is generate()."""
+    It generates datasets based on a prompt template. The main function is generate().
+    """
 
     def __init__(self, prompt_node: PromptNode, max_tries: int = 10):
         """Initialize the DatasetGenerator with a prompt node.
@@ -63,7 +65,7 @@ def generate(
         train_small_model_every_X_generations: Optional[int] = None,
         timeout_per_prompt: Optional[int] = None,
         log_every_n_api_calls: int = 25,
-        dummy_response: Optional[Union[str, Callable]] = None
+        dummy_response: Optional[Union[str, Callable]] = None,
     ) -> Union[Dataset, Tuple[Dataset, Dataset]]:
         """Generate a dataset based on a prompt template and support examples.
         Optionally, unlabeled examples can be provided to annotate unlabeled data.
@@ -96,16 +98,23 @@
             dataset.
         """
         if fewshot_dataset:
-            self._assert_fewshot_dataset_matches_prompt(prompt_template, fewshot_dataset)
+            self._assert_fewshot_dataset_matches_prompt(
+                prompt_template, fewshot_dataset
+            )
 
-        assert fewshot_sampling_strategy in [None, "uniform", "stratified"], \
-            "Sampling strategy must be 'uniform' or 'stratified'"
+        assert fewshot_sampling_strategy in [
+            None,
+            "uniform",
+            "stratified",
+        ], "Sampling strategy must be 'uniform' or 'stratified'"
 
         if fewshot_dataset and not fewshot_sampling_column:
            fewshot_sampling_column = prompt_template.generate_data_for_column[0]
 
-        assert small_model_training in [None, "text_classification"], \
-            "Task for small model training must be available in 'src/small_model_training' e.g. 'text_classification'"
+        assert small_model_training in [
+            None,
+            "text_classification",
+        ], "Task for small model training must be available in 'src/small_model_training' e.g. 'text_classification'"
 
         generated_dataset, original_dataset = self._inner_generate_loop(
             prompt_template,
@@ -121,7 +130,7 @@
             train_small_model_every_X_generations,
             timeout_per_prompt,
             log_every_n_api_calls,
-            dummy_response
+            dummy_response,
         )
 
         if return_unlabeled_dataset:
@@ -130,7 +139,10 @@
         return generated_dataset
 
     def _try_generate(
-        self, prompt_text: str, invocation_context: Dict, dummy_response: Optional[Union[str, Callable]]
+        self,
+        prompt_text: str,
+        invocation_context: Dict,
+        dummy_response: Optional[Union[str, Callable]],
     ) -> Optional[str]:
         """Tries to generate a single example. Restrict the time spent on this.
@@ -184,7 +196,7 @@ def _inner_generate_loop(
         train_small_model_every_x_generations: Optional[int],
         timeout_per_prompt: Optional[int],
         log_every_n_api_calls: int = 25,
-        dummy_response: Optional[Union[str, Callable]] = None
+        dummy_response: Optional[Union[str, Callable]] = None,
     ):
         current_tries_left = self._max_tries
         current_log_file = self._setup_log(prompt_template)
@@ -198,7 +210,9 @@
         api_calls = range(min(max_prompt_calls, num_samples_to_generate))
 
         for prompt_call_idx, unlabeled_example_idx in tqdm(
-            enumerate(api_calls, start=1), desc="Generating dataset", total=len(api_calls)
+            enumerate(api_calls, start=1),
+            desc="Generating dataset",
+            total=len(api_calls),
         ):
             fewshot_examples = None
             unlabeled_example = None
@@ -211,16 +225,25 @@
             prompt_labels = prompt_template.label_options
 
             # label-conditioned generation with label options
-            if fewshot_dataset is None and isinstance(prompt_labels,list) or prompt_call_idx > train_small_model_every_x_generations:
+            if (
+                fewshot_dataset is None
+                and isinstance(prompt_labels, list)
+                or prompt_call_idx > train_small_model_every_x_generations
+            ):
                 prompt_labels = choice(prompt_labels, 1)[0]
 
-            if fewshot_dataset and prompt_labels in fewshot_dataset['label']:
+            if fewshot_dataset and prompt_labels in fewshot_dataset["label"]:
                 prompt_labels, fewshot_examples = self._sample_fewshot_examples(
-                    prompt_labels, fewshot_dataset, fewshot_sampling_strategy, fewshot_examples_per_class,
-                    fewshot_sampling_column
+                    prompt_labels,
+                    fewshot_dataset,
+                    fewshot_sampling_strategy,
+                    fewshot_examples_per_class,
+                    fewshot_sampling_column,
                 )
 
-            prompt_text = prompt_template.get_prompt_text(prompt_labels, fewshot_examples)
+            prompt_text = prompt_template.get_prompt_text(
+                prompt_labels, fewshot_examples
+            )
 
             if unlabeled_dataset:
                 unlabeled_example = unlabeled_dataset[unlabeled_example_idx]
@@ -236,7 +259,9 @@
                 f"Invocation context: {invocation_context} \n"
             )
 
-            prediction = self._try_generate(prompt_text, invocation_context, dummy_response)
+            prediction = self._try_generate(
+                prompt_text, invocation_context, dummy_response
+            )
 
             if prediction is None:
                 current_tries_left -= 1
@@ -267,20 +292,28 @@
                     prediction, type(prompt_template.generate_data_for_column[0])
                 )
 
-                generated_dataset[prompt_template.generate_data_for_column[0]].append(prediction)
+                generated_dataset[prompt_template.generate_data_for_column[0]].append(
+                    prediction
+                )
             else:
-                generated_dataset[prompt_template.DEFAULT_TEXT_COLUMN[0]].append(prediction)
+                generated_dataset[prompt_template.DEFAULT_TEXT_COLUMN[0]].append(
+                    prediction
+                )
 
             if prompt_labels and isinstance(prompt_labels, str):
-                generated_dataset[prompt_template.DEFAULT_LABEL_COLUMN[0]].append(prompt_labels)
+                generated_dataset[prompt_template.DEFAULT_LABEL_COLUMN[0]].append(
+                    prompt_labels
+                )
 
             log_entry = {
                 "prompt": prompt_text,
                 "invocation_context": invocation_context,
                 "prediction": prediction,
-                "target": prompt_template.generate_data_for_column[0]
-                if prompt_template.generate_data_for_column
-                else prompt_template.DEFAULT_TEXT_COLUMN[0],
+                "target": (
+                    prompt_template.generate_data_for_column[0]
+                    if prompt_template.generate_data_for_column
+                    else prompt_template.DEFAULT_TEXT_COLUMN[0]
+                ),
             }
             with open(current_log_file, "a", encoding="utf-8") as log_file:
                 log_file.write(f"{json.dumps(log_entry)}\n")
@@ -290,7 +323,9 @@
                 original_dataset[key].append(value)
 
             if prompt_call_idx >= max_prompt_calls:
-                logger.info("Reached maximum number of prompt calls ({}).", max_prompt_calls)
logger.info("Reached maximum number of prompt calls ({}).", max_prompt_calls) + logger.info( + "Reached maximum number of prompt calls ({}).", max_prompt_calls + ) break if len(generated_dataset) >= num_samples_to_generate: @@ -300,10 +335,15 @@ def _inner_generate_loop( if timeout_per_prompt is not None: time.sleep(timeout_per_prompt) - if train_small_model_every_x_generations is not None and train_small_model_every_x_generations > 0: + if ( + train_small_model_every_x_generations is not None + and train_small_model_every_x_generations > 0 + ): if prompt_call_idx % train_small_model_every_x_generations == 0: logger.info("Commencing small model training.") - small_model = import_module("src.small_model_training." + small_model_training, __package__) + small_model = import_module( + "src.small_model_training." + small_model_training, __package__ + ) inf_subset = small_model.get_influential_subset(generated_dataset) fewshot_dataset = inf_subset logger.info("Continuing generation.") @@ -335,7 +375,9 @@ def _convert_prediction(self, prediction: str, target_type: type) -> Any: except ValueError: logger.warning( "Could not convert prediction {} to type {}. " - "Returning original prediction.", repr(prediction), target_type + "Returning original prediction.", + repr(prediction), + target_type, ) return prediction @@ -345,7 +387,7 @@ def _sample_fewshot_examples( fewshot_dataset: Dataset, fewshot_sampling_strategy: str, fewshot_examples_per_class: int, - fewshot_sampling_column: str + fewshot_sampling_column: str, ) -> Tuple[Union[List[str], str], Dataset]: if fewshot_sampling_strategy == "uniform": @@ -354,31 +396,38 @@ def _sample_fewshot_examples( lambda example: example[fewshot_sampling_column] == prompt_labels ) fewshot_examples = fewshot_examples.shuffle().select( - range(fewshot_examples_per_class) if fewshot_examples_per_class is not None else range(len(fewshot_examples)) + range(fewshot_examples_per_class) + if fewshot_examples_per_class is not None + else range(len(fewshot_examples)) ) elif fewshot_sampling_strategy == "stratified": fewshot_examples = single_label_stratified_sample( - fewshot_dataset, - fewshot_sampling_column, - fewshot_examples_per_class + fewshot_dataset, fewshot_sampling_column, fewshot_examples_per_class ) else: if fewshot_examples_per_class: - fewshot_examples = fewshot_dataset.shuffle().select(range(fewshot_examples_per_class)) + fewshot_examples = fewshot_dataset.shuffle().select( + range(fewshot_examples_per_class) + ) else: fewshot_examples = fewshot_dataset.shuffle() - assert len(fewshot_examples) > 0, f"Could not find any fewshot examples for label(s) {prompt_labels}." \ - f"Ensure that labels of fewshot examples match the label_options " \ - f"from the prompt." + assert len(fewshot_examples) > 0, ( + f"Could not find any fewshot examples for label(s) {prompt_labels}." + f"Ensure that labels of fewshot examples match the label_options " + f"from the prompt." 
diff --git a/src/fabricator/prompts/base.py b/src/fabricator/prompts/base.py
index 7c6e549..d001930 100644
--- a/src/fabricator/prompts/base.py
+++ b/src/fabricator/prompts/base.py
@@ -6,7 +6,8 @@ class BasePrompt:
     """Base class for prompt generation. This class formats the prompt for the fewshot / support set examples
-    and the target variable such that the dataset generator can simply put in the invocation context."""
+    and the target variable such that the dataset generator can simply put in the invocation context.
+    """
 
     DEFAULT_TEXT_COLUMN = ["text"]
     DEFAULT_LABEL_COLUMN = ["label"]
@@ -63,7 +64,9 @@ def __init__(
         self.inner_fewshot_example_separator = inner_fewshot_example_separator
 
         if fewshot_example_columns:
-            self.relevant_columns_for_fewshot_examples = self.fewshot_example_columns + self.generate_data_for_column
+            self.relevant_columns_for_fewshot_examples = (
+                self.fewshot_example_columns + self.generate_data_for_column
+            )
         elif generate_data_for_column:
             self.relevant_columns_for_fewshot_examples = self.generate_data_for_column
         else:
@@ -73,7 +76,10 @@
         if self.relevant_columns_for_fewshot_examples:
             if fewshot_formatting_template is None:
                 self.fewshot_prompt = self.inner_fewshot_example_separator.join(
-                    [f"{var}: {{{var}}}" for var in self.relevant_columns_for_fewshot_examples]
+                    [
+                        f"{var}: {{{var}}}"
+                        for var in self.relevant_columns_for_fewshot_examples
+                    ]
                 )
             else:
                 self.fewshot_prompt = fewshot_formatting_template
@@ -94,8 +100,10 @@ def _assert_task_description_is_formattable(task_description: str) -> None:
             task_description (str): Task description for the prompt (prefix).
         """
         if "testxyz" not in task_description.format("testxyz"):
-            raise KeyError("If you provide label_options, you need the task_description to be formattable like"
-                           " 'Generate a {} text.'")
+            raise KeyError(
+                "If you provide label_options, you need the task_description to be formattable like"
+                " 'Generate a {} text.'"
+            )
 
     def _infer_target_formatting_template(self) -> str:
         """Infer target formatting template from input columns and label column.
@@ -105,8 +113,8 @@
         """
         if self.generate_data_for_column and self.fewshot_example_columns:
             target_template = self.inner_fewshot_example_separator.join(
-                [f"{var}: {{{var}}}" for var in self.fewshot_example_columns] +
-                [f"{self.generate_data_for_column[0]}: "]
+                [f"{var}: {{{var}}}" for var in self.fewshot_example_columns]
+                + [f"{self.generate_data_for_column[0]}: "]
             )
 
         elif self.generate_data_for_column and not self.fewshot_example_columns:
@@ -116,8 +124,10 @@
             target_template = f"{self.DEFAULT_TEXT_COLUMN[0]}: "
 
         else:
-            raise ValueError("Either generate_data_for_column or generate_data_for_column + fewshot_example_columns "
-                             "must be provided to infer target template.")
+            raise ValueError(
+                "Either generate_data_for_column or generate_data_for_column + fewshot_example_columns "
+                "must be provided to infer target template."
+            )
 
         return target_template
 
@@ -139,11 +149,19 @@ def _log_prompt(self) -> str:
             fewshot_examples[column] = [f"EXAMPLE TEXT FOR COLUMN {column}"]
             fewshot_examples = Dataset.from_dict(fewshot_examples)
 
-        return "\nThe prompt to the LLM will be like:\n" + 10*"-" + "\n"\
-               + self.get_prompt_text(label, fewshot_examples) + "\n" + 10*"-"
+        return (
+            "\nThe prompt to the LLM will be like:\n"
+            + 10 * "-"
+            + "\n"
+            + self.get_prompt_text(label, fewshot_examples)
+            + "\n"
+            + 10 * "-"
+        )
 
     @staticmethod
-    def filter_example_by_columns(example: Dict[str, str], columns: List[str]) -> Dict[str, str]:
+    def filter_example_by_columns(
+        example: Dict[str, str], columns: List[str]
+    ) -> Dict[str, str]:
         """Filter single example by columns.
 
         Args:
@@ -153,10 +171,14 @@ def filter_example_by_columns(example: Dict[str, str], columns: List[str]) -> Di
         Returns:
             Dict[str, str]: Filtered example
         """
-        filtered_example = {key: value for key, value in example.items() if key in columns}
+        filtered_example = {
+            key: value for key, value in example.items() if key in columns
+        }
         return filtered_example
 
-    def filter_examples_by_columns(self, dataset: Dataset, columns: List[str]) -> List[Dict[str, str]]:
+    def filter_examples_by_columns(
+        self, dataset: Dataset, columns: List[str]
+    ) -> List[Dict[str, str]]:
         """Filter examples by columns.
 
         Args:
@@ -171,7 +193,9 @@ def filter_examples_by_columns(self, dataset: Dataset, columns: List[str]) -> Li
             filtered_inputs.append(self.filter_example_by_columns(example, columns))
         return filtered_inputs
 
-    def get_prompt_text(self, labels: Union[str, List[str]] = None, examples: Optional[Dataset] = None) -> str:
+    def get_prompt_text(
+        self, labels: Union[str, List[str]] = None, examples: Optional[Dataset] = None
+    ) -> str:
         """Get prompt text for the given examples.
 
         Args:
@@ -188,21 +212,27 @@ def get_prompt_text(self, labels: Union[str, List[str]] = None, examples: Option
             task_description = self.task_description.format(labels)
             # due to small_model training fewshot_examples may have different labels
             if examples:
-                examples = examples.filter(lambda example: example['label'] == labels)
+                examples = examples.filter(lambda example: example["label"] == labels)
         else:
             task_description = self.task_description
 
         if examples:
             # due to small_model training fewshot_examples appear after some iterations
             if self.relevant_columns_for_fewshot_examples is None:
-                self.relevant_columns_for_fewshot_examples = ['text']
+                self.relevant_columns_for_fewshot_examples = ["text"]
                 self.fewshot_prompt = self.inner_fewshot_example_separator.join(
                     [f"{{{var}}}" for var in self.relevant_columns_for_fewshot_examples]
                 )
-            examples = self.filter_examples_by_columns(examples, self.relevant_columns_for_fewshot_examples)
-            formatted_examples = [self.fewshot_prompt.format(**example) for example in examples]
+            examples = self.filter_examples_by_columns(
+                examples, self.relevant_columns_for_fewshot_examples
+            )
+            formatted_examples = [
+                self.fewshot_prompt.format(**example) for example in examples
+            ]
             prompt_text = self.fewshot_example_separator.join(
-                [task_description] + formatted_examples + [self.target_formatting_template]
+                [task_description]
+                + formatted_examples
+                + [self.target_formatting_template]
             )
         else:
             prompt_text = self.fewshot_example_separator.join(
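
To make the prompt assembly in get_prompt_text() concrete, here is a small illustrative sketch. It is not part of the diff; the import path, constructor arguments, and default separators are assumptions, and only the column/formatting logic follows the code above.

    from datasets import Dataset
    from fabricator import BasePrompt  # import path assumed

    prompt = BasePrompt(
        task_description="Generate a {} movie review.",
        label_options=["positive", "negative"],
        generate_data_for_column=["text"],
    )

    fewshot_examples = Dataset.from_dict(
        {"text": ["A touching story, beautifully shot."], "label": ["positive"]}
    )

    # With the assumed default separators this roughly yields:
    #   Generate a positive movie review.
    #
    #   text: A touching story, beautifully shot.
    #
    #   text:
    print(prompt.get_prompt_text("positive", fewshot_examples))
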
diff --git a/src/small_model_training/create_inf_subset.py b/src/small_model_training/create_inf_subset.py
deleted file mode 100644
index 0d67f76..0000000
--- a/src/small_model_training/create_inf_subset.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import torch.nn
-import numpy as np
-from transformers import AutoTokenizer
-from transformers import AutoModelForSequenceClassification
-from transformers import DataCollatorWithPadding
-from transformers import Trainer
-from datasets import load_dataset
-import evaluate
-
-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    predictions = np.argmax(predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=labels)
-
-accuracy = evaluate.load("accuracy")
-
-model = AutoModelForSequenceClassification.from_pretrained("./imdb_model")
-
-imdb_train = load_dataset("imdb", split="train[:10%]")
-imdb_test = load_dataset("imdb", split="test[:1%]")
-
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
-
-def preprocess_function(examples):
-    return tokenizer(examples["text"], truncation=True)
-
-
-tokenized_imdb_train = imdb_train.map(preprocess_function, batched=True)
-tokenized_imdb_test = imdb_test.map(preprocess_function, batched=True)
-
-data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-trainer = Trainer(
-    model=model,
-    train_dataset=tokenized_imdb_train,
-    eval_dataset=tokenized_imdb_test,
-    tokenizer=tokenizer,
-    data_collator=data_collator,
-    compute_metrics=compute_metrics,
-)
-
-outputs = trainer.predict(tokenized_imdb_test)
-
-logits = outputs[0]
-
-logits = torch.from_numpy(logits)
-
-scores = torch.nn.functional.softmax(logits, dim=-1)
-
-first_values = scores[:, 0]
-second_values = scores[:, 1]
-
-distance = first_values-second_values
-
-distance = torch.abs(first_values-second_values)
-knn_values, knn_indices = distance.topk(5, largest=False)
-
-fewshot_examples = []
-
-for elem in knn_indices:
-    fewshot_examples.append(imdb_test[elem.item()]["text"])
-
-print(fewshot_examples)
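
The deleted script above picked the five test examples with the smallest softmax margin as fewshot candidates; a generalized selection (calculate_candidates_entropy_knn) now lives in src/small_model_training/text_classification.py at the end of this diff. As a standalone illustration of that selection idea (the function name and data below are made up for the sketch):

    import torch

    def least_confident_indices(logits: torch.Tensor, k: int) -> torch.Tensor:
        """Return the indices of the k examples the classifier is least sure about."""
        scores = torch.nn.functional.softmax(logits, dim=-1)
        # Small |p(class 0) - p(class 1)| means the prediction is ambiguous.
        margin = torch.abs(scores[:, 0] - scores[:, 1])
        _, indices = margin.topk(k, largest=False)
        return indices

    # Logits for four examples of a binary classifier.
    logits = torch.tensor([[2.0, -1.0], [0.1, 0.0], [-1.5, 1.5], [0.6, 0.2]])
    print(least_confident_indices(logits, 2))  # tensor([1, 3]): the two most ambiguous examples
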
diff --git a/src/small_model_training/project_experiments/cola/cola_gold_downstream.py b/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
index 6250cce..e43e9f5 100644
--- a/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
+++ b/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
@@ -1,3 +1,3 @@
 from datasets import load_dataset
 
-gold_dataset = load_dataset('glue', 'cola', split='train')
\ No newline at end of file
+gold_dataset = load_dataset("glue", "cola", split="train")
diff --git a/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
index a032d29..8f72027 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
@@ -12,27 +12,34 @@
 model_name = "bert-base-uncased"
 
-label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
+label2id = {"positive": 1, "negative": 0, "pos": 1, "neg": 0}
 id2label = {1: "positive", 0: "negative"}
 
 # setup preprocessing
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
+
 def preprocess_text(batch):
     preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
     return preprocessed_tokens
+
+
 def preprocess_text_and_labels(batch):
     preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
     preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
     return preprocessed_tokens
 
+
 # setup compute_metrics
 accuracy = evaluate.load("accuracy")
+
+
 def compute_metrics(eval_pred):
     predictions, labels = eval_pred
     predictions = np.argmax(predictions, axis=1)
     return accuracy.compute(predictions=predictions, references=labels)
 
+
 generated_dataset = generated_dataset.train_test_split(test_size=0.1)
 train_split = generated_dataset["train"]
 val_split = generated_dataset["test"]
@@ -59,7 +66,7 @@ def compute_metrics(eval_pred):
         weight_decay=0.01,
         evaluation_strategy="epoch",
         save_strategy="epoch",
-        load_best_model_at_end=True
+        load_best_model_at_end=True,
     ),
     train_dataset=tokenized_train,
     eval_dataset=tokenized_val,
@@ -78,9 +85,6 @@ def compute_metrics(eval_pred):
 
 results_timestamp = datetime.datetime.now()
 
-with open(results_path, 'a') as file:
+with open(results_path, "a") as file:
     file.write(f"{results_timestamp} - imdb_baseline_downstream\n")
     file.write(f"accuracy: {test_accuracy}\n")
-
-
-
diff --git a/src/small_model_training/project_experiments/imdb/imdb_baseline_generation.py b/src/small_model_training/project_experiments/imdb/imdb_baseline_generation.py
index 274b519..2eb2caa 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_baseline_generation.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_baseline_generation.py
@@ -14,7 +14,6 @@
     model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
     max_length=300,
     model_kwargs={"model_kwargs": {"do_sample": True, "temperature": 0.4, "top_p": 1}},
-
 )
 
 generator = DatasetGenerator(prompt_node)
@@ -24,4 +23,4 @@
     num_samples_to_generate=500,
 )
 
-generated_dataset.push_to_hub("500-movie-reviews-baseline")
\ No newline at end of file
+generated_dataset.push_to_hub("500-movie-reviews-baseline")
diff --git a/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
index 35f2251..88926cd 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
@@ -7,7 +7,7 @@
 from datasets import load_dataset
 import evaluate
 
-gold_dataset = load_dataset('imdb',split='train')
+gold_dataset = load_dataset("imdb", split="train")
 gold_dataset = gold_dataset.train_test_split(test_size=0.1)
 train_split = gold_dataset["train"]
 val_split = gold_dataset["test"]
@@ -15,23 +15,28 @@
 model_name = "bert-base-uncased"
 
-label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
+label2id = {"positive": 1, "negative": 0, "pos": 1, "neg": 0}
 id2label = {1: "positive", 0: "negative"}
 
 # setup preprocessing
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
+
 def preprocess_text(batch):
     preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
     return preprocessed_tokens
 
+
 # setup compute_metrics
 accuracy = evaluate.load("accuracy")
+
+
 def compute_metrics(eval_pred):
     predictions, labels = eval_pred
     predictions = np.argmax(predictions, axis=1)
     return accuracy.compute(predictions=predictions, references=labels)
 
+
 tokenized_train = train_split.map(preprocess_text, batched=True)
 tokenized_val = val_split.map(preprocess_text, batched=True)
 tokenized_test = test_split.map(preprocess_text, batched=True)
@@ -54,7 +59,7 @@ def compute_metrics(eval_pred):
         weight_decay=0.01,
         evaluation_strategy="epoch",
         save_strategy="epoch",
-        load_best_model_at_end=True
+        load_best_model_at_end=True,
     ),
     train_dataset=tokenized_train,
     eval_dataset=tokenized_val,
@@ -73,9 +78,6 @@ def compute_metrics(eval_pred):
 
 results_timestamp = datetime.datetime.now()
 
-with open(results_path, 'a') as file:
+with open(results_path, "a") as file:
     file.write(f"{results_timestamp} - imdb_gold_downstream\n")
     file.write(f"accuracy: {test_accuracy}\n")
-
-
-
diff --git a/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
index 80f7b97..9b7cc1b 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
@@ -12,27 +12,34 @@
 model_name = "bert-base-uncased"
 
-label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
+label2id = {"positive": 1, "negative": 0, "pos": 1, "neg": 0}
 id2label = {1: "positive", 0: "negative"}
 
 # setup preprocessing
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
+
 def preprocess_text(batch):
     preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
     return preprocessed_tokens
+
+
 def preprocess_text_and_labels(batch):
     preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
     preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
     return preprocessed_tokens
 
+
 # setup compute_metrics
 accuracy = evaluate.load("accuracy")
+
+
 def compute_metrics(eval_pred):
     predictions, labels = eval_pred
     predictions = np.argmax(predictions, axis=1)
     return accuracy.compute(predictions=predictions, references=labels)
 
+
 generated_dataset = generated_dataset.train_test_split(test_size=0.1)
 train_split = generated_dataset["train"]
 val_split = generated_dataset["test"]
@@ -59,7 +66,7 @@ def compute_metrics(eval_pred):
         weight_decay=0.01,
         evaluation_strategy="epoch",
         save_strategy="epoch",
-        load_best_model_at_end=True
+        load_best_model_at_end=True,
     ),
     train_dataset=tokenized_train,
     eval_dataset=tokenized_val,
@@ -78,9 +85,6 @@ def compute_metrics(eval_pred):
 
 results_timestamp = datetime.datetime.now()
 
-with open(results_path, 'a') as file:
+with open(results_path, "a") as file:
     file.write(f"{results_timestamp} - imdb_smt_downstream\n")
     file.write(f"accuracy: {test_accuracy}\n")
-
-
-
diff --git a/src/small_model_training/project_experiments/imdb/imdb_smt_generation.py b/src/small_model_training/project_experiments/imdb/imdb_smt_generation.py
index dd08d15..1dc5928 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_smt_generation.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_smt_generation.py
@@ -14,7 +14,6 @@
     model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
     max_length=300,
     model_kwargs={"model_kwargs": {"do_sample": True, "temperature": 0.4, "top_p": 1}},
-
 )
 
 generator = DatasetGenerator(prompt_node)
@@ -22,8 +21,8 @@
     prompt_template=prompt,
     max_prompt_calls=500,
     num_samples_to_generate=500,
-    small_model_training='text_classification',
+    small_model_training="text_classification",
     train_small_model_every_X_generations=50,
 )
 
-generated_dataset.push_to_hub("500-movie-reviews-smt")
\ No newline at end of file
+generated_dataset.push_to_hub("500-movie-reviews-smt")
diff --git a/src/small_model_training/text_classification.py b/src/small_model_training/text_classification.py
index 98a39a8..8242a01 100644
--- a/src/small_model_training/text_classification.py
+++ b/src/small_model_training/text_classification.py
@@ -5,15 +5,25 @@
 from dataclasses import dataclass, field
 from transformers import AutoTokenizer
 from transformers import DataCollatorWithPadding
-from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, HfArgumentParser
+from transformers import (
+    AutoModelForSequenceClassification,
+    TrainingArguments,
+    Trainer,
+    HfArgumentParser,
+)
 from datasets import load_dataset, Dataset
 import evaluate
+
+
 @dataclass
 class ModelArguments:
     model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model from huggingface.co/models"}
+        metadata={
+            "help": "Path to pretrained model or model from huggingface.co/models"
+        }
     )
 
+
 @dataclass
 class FewshotArguments:
     avoid_duplicate_fewshots: bool = field(
@@ -23,6 +33,7 @@ class FewshotArguments:
         metadata={"help": "Limit for the size of the influential subset"}
     )
 
+
 def get_influential_subset(dataset):
     id2label = {}
     label2id = {}
@@ -37,7 +48,7 @@ def get_influential_subset(dataset):
         id2label[counter] = entry["label"]
         counter += 1
 
-    #get path to config
+    # get path to config
     def get_project_root() -> str:
         current_dir = os.path.abspath(os.path.dirname(__file__))
         while not os.path.isfile(os.path.join(current_dir, "README.md")):
@@ -64,6 +75,7 @@ def preprocess_function(batch):
 
     # setup compute_metrics
     accuracy = evaluate.load("accuracy")
+
     def compute_metrics(eval_pred):
         predictions, labels = eval_pred
         predictions = np.argmax(predictions, axis=1)
@@ -81,7 +93,10 @@ def compute_metrics(eval_pred):
     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
     model = AutoModelForSequenceClassification.from_pretrained(
-        model_args.model_name_or_path, num_labels=counter, id2label=id2label, label2id=label2id
+        model_args.model_name_or_path,
+        num_labels=counter,
+        id2label=id2label,
+        label2id=label2id,
     )
 
     # initialize training
@@ -100,7 +115,10 @@ def compute_metrics(eval_pred):
 
     inf_subset = []
 
-    indices = calculate_candidates_entropy_knn(outputs, min(math.ceil(len(dataset["test"])/2),fewshot_args.max_inf_subset_size))
+    indices = calculate_candidates_entropy_knn(
+        outputs,
+        min(math.ceil(len(dataset["test"]) / 2), fewshot_args.max_inf_subset_size),
+    )
 
     for elem in indices:
         inf_subset.append(dataset["test"][elem.item()])
@@ -121,13 +139,14 @@ def compute_metrics(eval_pred):
     return inf_subset
 
-#TO-DO: dynamic variant for calculating inf_subset
+
 def calculate_candidates_entropy_knn(model_outputs, num_candidates):
     logits = model_outputs[0]
 
     logits = torch.from_numpy(logits)
 
     scores = torch.nn.functional.softmax(logits, dim=-1)
+
     def entropy(probabilities):
         return -torch.sum(probabilities * torch.log2(probabilities))