From dcf0ee3da16febe115667560f7e006e2a9800838 Mon Sep 17 00:00:00 2001
From: jcollopy-tulane <jcollopy@tulane.edu>
Date: Mon, 29 Apr 2024 16:55:32 -0500
Subject: [PATCH] Merge New_Bert data-prep cells, clear outputs, save model via save_pretrained

---
 notebooks/New_Bert.ipynb | 99 +++++++---------------------------------
 1 file changed, 16 insertions(+), 83 deletions(-)

diff --git a/notebooks/New_Bert.ipynb b/notebooks/New_Bert.ipynb
index 47ec344..e97e746 100644
--- a/notebooks/New_Bert.ipynb
+++ b/notebooks/New_Bert.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 48,
    "id": "9a807a6a-bfb2-44f7-ac15-24043f0b388f",
    "metadata": {},
    "outputs": [],
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 50,
    "id": "883141b4-cd6d-45d0-ad6b-caa3b905ae10",
    "metadata": {},
    "outputs": [],
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 53,
    "id": "74ac3c58-156e-4f6e-9fa0-a522f4f21449",
    "metadata": {},
    "outputs": [],
@@ -45,16 +45,8 @@
     "        padding=\"max_length\",\n",
     "        max_length=max_length,\n",
     "        return_tensors=\"pt\"\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "6457cfad-6088-4f25-b30e-7105e5aee114",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    )\n",
+    "\n",
     "class CommentsDataset(Dataset):\n",
     "    def __init__(self, encodings, labels):\n",
     "        self.encodings = encodings\n",
@@ -66,32 +58,16 @@
     "        return item\n",
     "\n",
     "    def __len__(self):\n",
-    "        return len(self.labels)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "514cb603-8131-4195-b297-aae6169d9424",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        return len(self.labels)\n",
+    "\n",
     "train_encodings = tokenize(train_df)\n",
     "val_encodings = tokenize(val_df)\n",
     "test_encodings = tokenize(test_df)\n",
     "\n",
     "train_dataset = CommentsDataset(train_encodings, train_df['Result_Bin'])\n",
     "val_dataset = CommentsDataset(val_encodings, val_df['Result_Bin'])\n",
-    "test_dataset = CommentsDataset(test_encodings, test_df['Result_Bin'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "b2171c6a-a7ea-4d98-815f-e04d5e428a7a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "test_dataset = CommentsDataset(test_encodings, test_df['Result_Bin'])\n",
+    "\n",
     "train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)\n",
     "val_loader = DataLoader(val_dataset, batch_size=10)\n",
     "test_loader = DataLoader(test_dataset, batch_size=10)"
@@ -99,43 +75,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "id": "8d1851fc-f761-45ad-a7fa-c779fa6a3ecd",
+   "execution_count": null,
+   "id": "3a7f59a5-371e-4c5c-9c6c-6a60448bf365",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "/Users/jackiecollopy/nlp-virtual/lib/python3.11/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
-      "  warnings.warn(\n",
-      "Epoch 1/6:   0%|                                                                             | 0/683 [00:00<?, ?batch/s]/var/folders/hs/br_4rpdj68nc3sfdpgv0xgn80000gn/T/ipykernel_6841/2057042811.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
-      "  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
-      "Epoch 1/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:16<00:00,  8.91batch/s, loss=0.7616]\n",
-      "Epoch 2/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:23<00:00,  8.19batch/s, loss=0.5100]\n",
-      "Epoch 3/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:24<00:00,  8.09batch/s, loss=0.3012]\n",
-      "Epoch 4/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:19<00:00,  8.54batch/s, loss=0.7624]\n",
-      "Epoch 5/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:24<00:00,  8.11batch/s, loss=0.4034]\n",
-      "Epoch 6/6: 100%|██████████████████████████████████████████████████████| 683/683 [01:25<00:00,  7.99batch/s, loss=0.0319]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Training completed.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-mini', num_labels=2)\n",
     "optimizer = AdamW(model.parameters(), lr=5e-5)\n",
@@ -144,7 +87,6 @@
     "model.to(device)\n",
     "model.train()\n",
     "\n",
-    "# Define the total number of epochs\n",
     "total_epochs = 6\n",
     "\n",
     "for epoch in range(total_epochs):\n",
@@ -157,10 +99,11 @@
     "            optimizer.step()\n",
     "            optimizer.zero_grad()\n",
     "\n",
-    "            # Update the progress bar with loss information\n",
     "            pbar.set_postfix(loss=f\"{loss.item():.4f}\")\n",
     "\n",
-    "print(\"Training completed.\")"
+    "print(\"Training completed.\")\n",
+    "\n",
+    "model.save_pretrained('./bert_pth')"
    ]
   },
   {
@@ -203,17 +146,7 @@
     "f1 = f1_score(true_labels, predictions)\n",
     "print(\"Precision on Test:\", round(precision,3))\n",
     "print(\"Recall on Test:\", round(recall,3))\n",
-    "print(\"F1 Score on Test:\", round(f1,3))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "0c82c5f8-2de1-4cc5-aed1-1bbdce628e8e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "torch.save(model.state_dict(), 'bert_model.pth')"
+    "print(\"F1 Score on Test:\", round(f1,3))"
    ]
   }
  ],