Commit

IDK
jcollopy-tulane committed Apr 29, 2024
1 parent 71825c7 commit dcf0ee3
Showing 1 changed file with 16 additions and 83 deletions.
99 changes: 16 additions & 83 deletions notebooks/New_Bert.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 48,
"id": "9a807a6a-bfb2-44f7-ac15-24043f0b388f",
"metadata": {},
"outputs": [],
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 50,
"id": "883141b4-cd6d-45d0-ad6b-caa3b905ae10",
"metadata": {},
"outputs": [],
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 53,
"id": "74ac3c58-156e-4f6e-9fa0-a522f4f21449",
"metadata": {},
"outputs": [],
@@ -45,16 +45,8 @@
" padding=\"max_length\",\n",
" max_length=max_length,\n",
" return_tensors=\"pt\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6457cfad-6088-4f25-b30e-7105e5aee114",
"metadata": {},
"outputs": [],
"source": [
" )\n",
"\n",
"class CommentsDataset(Dataset):\n",
" def __init__(self, encodings, labels):\n",
" self.encodings = encodings\n",
@@ -66,76 +58,27 @@
" return item\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "514cb603-8131-4195-b297-aae6169d9424",
"metadata": {},
"outputs": [],
"source": [
" return len(self.labels)\n",
"\n",
"train_encodings = tokenize(train_df)\n",
"val_encodings = tokenize(val_df)\n",
"test_encodings = tokenize(test_df)\n",
"\n",
"train_dataset = CommentsDataset(train_encodings, train_df['Result_Bin'])\n",
"val_dataset = CommentsDataset(val_encodings, val_df['Result_Bin'])\n",
"test_dataset = CommentsDataset(test_encodings, test_df['Result_Bin'])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "b2171c6a-a7ea-4d98-815f-e04d5e428a7a",
"metadata": {},
"outputs": [],
"source": [
"test_dataset = CommentsDataset(test_encodings, test_df['Result_Bin'])\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)\n",
"val_loader = DataLoader(val_dataset, batch_size=10)\n",
"test_loader = DataLoader(test_dataset, batch_size=10)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "8d1851fc-f761-45ad-a7fa-c779fa6a3ecd",
"execution_count": null,
"id": "3a7f59a5-371e-4c5c-9c6c-6a60448bf365",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/Users/jackiecollopy/nlp-virtual/lib/python3.11/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"Epoch 1/6: 0%| | 0/683 [00:00<?, ?batch/s]/var/folders/hs/br_4rpdj68nc3sfdpgv0xgn80000gn/T/ipykernel_6841/2057042811.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
"Epoch 1/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:16<00:00, 8.91batch/s, loss=0.7616]\n",
"Epoch 2/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:23<00:00, 8.19batch/s, loss=0.5100]\n",
"Epoch 3/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:24<00:00, 8.09batch/s, loss=0.3012]\n",
"Epoch 4/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:19<00:00, 8.54batch/s, loss=0.7624]\n",
"Epoch 5/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:24<00:00, 8.11batch/s, loss=0.4034]\n",
"Epoch 6/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:25<00:00, 7.99batch/s, loss=0.0319]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training completed.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-mini', num_labels=2)\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
@@ -144,7 +87,6 @@
"model.to(device)\n",
"model.train()\n",
"\n",
"# Define the total number of epochs\n",
"total_epochs = 6\n",
"\n",
"for epoch in range(total_epochs):\n",
@@ -157,10 +99,11 @@
" optimizer.step()\n",
" optimizer.zero_grad()\n",
"\n",
" # Update the progress bar with loss information\n",
" pbar.set_postfix(loss=f\"{loss.item():.4f}\")\n",
"\n",
"print(\"Training completed.\")"
"print(\"Training completed.\")\n",
"\n",
"model.save_pretrained('./bert_pth')"
]
},
{
@@ -203,17 +146,7 @@
"f1 = f1_score(true_labels, predictions)\n",
"print(\"Precision on Test:\", round(precision,3))\n",
"print(\"Recall on Test:\", round(recall,3))\n",
"print(\"F1 Score on Test:\", round(f1,3))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "0c82c5f8-2de1-4cc5-aed1-1bbdce628e8e",
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), 'bert_model.pth')"
"print(\"F1 Score on Test:\", round(f1,3)"
]
}
],
