
Commit

create more files for experiments
stolzenp committed Apr 20, 2024
1 parent d793040 commit 0958f34
Showing 18 changed files with 256 additions and 11 deletions.
@@ -0,0 +1,3 @@
from datasets import load_dataset

gold_dataset = load_dataset('glue', 'cola', split='train')
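# Hypothetical follow-up, not part of the commit: a quick sanity check that the
# CoLA train split loaded as expected.
print(gold_dataset)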
Empty file.
Empty file.
@@ -0,0 +1,86 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

generated_dataset = load_dataset("stolzenp/500-movie-reviews-baseline", split="train")
test_split = load_dataset("imdb", split="test")

model_name = "bert-base-uncased"

label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
id2label = {1: "positive", 0: "negative"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_text(batch):
    # tokenize only; the imdb test split already carries integer labels
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    return preprocessed_tokens

def preprocess_text_and_labels(batch):
    # tokenize and map the generated string labels to integer ids
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
    return preprocessed_tokens

# setup compute_metrics
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # turn logits into predicted class ids
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

generated_dataset = generated_dataset.train_test_split(test_size=0.1)
train_split = generated_dataset["train"]
val_split = generated_dataset["test"]

tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
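# Note (observation, not from the commit): DataCollatorWithPadding re-pads each
# training batch dynamically, so padding=True in the tokenizer calls above is
# redundant here, though harmless.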

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="baseline_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs.metrics["test_accuracy"]
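# For reference: trainer.predict returns a PredictionOutput named tuple of
# (predictions, label_ids, metrics), with metric keys prefixed "test" by
# default, hence "test_accuracy".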

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - imdb_baseline_downstream\n")
    file.write(f"accuracy: {test_accuracy}\n")



@@ -0,0 +1,81 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

gold_dataset = load_dataset("imdb", split="train")
gold_dataset = gold_dataset.train_test_split(test_size=0.1)
train_split = gold_dataset["train"]
val_split = gold_dataset["test"]
test_split = load_dataset("imdb", split="test")
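# Note: train_test_split shuffles randomly, so the 90/10 split above changes
# between runs; a seed would pin it down, e.g. (hypothetical):
# gold_dataset.train_test_split(test_size=0.1, seed=42)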

model_name = "bert-base-uncased"

label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
id2label = {1: "positive", 0: "negative"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_text(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    return preprocessed_tokens

# setup compute_metrics
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

tokenized_train = train_split.map(preprocess_text, batched=True)
tokenized_val = val_split.map(preprocess_text, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="gold_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs.metrics["test_accuracy"]

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - imdb_gold_downstream\n")
    file.write(f"accuracy: {test_accuracy}\n")



@@ -0,0 +1,86 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

generated_dataset = load_dataset("stolzenp/500-movie-reviews-smt", split="train")
test_split = load_dataset("imdb", split="test")

model_name = "bert-base-uncased"

label2id = {"positive":1, "negative":0, "pos":1, "neg":0}
id2label = {1: "positive", 0: "negative"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_text(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    return preprocessed_tokens

def preprocess_text_and_labels(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
    return preprocessed_tokens
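# Context: the generated reviews are assumed to carry string labels such as
# "positive"/"pos", mapped to ids in preprocess_text_and_labels, while the imdb
# test split already stores integer labels, hence the two preprocessing paths.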

# setup compute_metrics
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

generated_dataset = generated_dataset.train_test_split(test_size=0.1)
train_split = generated_dataset["train"]
val_split = generated_dataset["test"]

tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="smt_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs.metrics["test_accuracy"]

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - imdb_smt_downstream\n")
    file.write(f"accuracy: {test_accuracy}\n")



This file was deleted.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
1 change: 0 additions & 1 deletion src/small_model_training/text_classification.py
@@ -57,7 +57,6 @@ def get_path_to_json_file(relative_path_to_json_file) -> str:
# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)


def preprocess_function(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
