Ideas

tulane-cmps6730 · Apr 30, 2024 · ed2617c
1 parent 39c7fe2
commit ed2617c
Show file tree

Hide file tree

Showing 8 changed files with 253 additions and 69 deletions.
diff --git a/nlp/cli.py b/nlp/cli.py
@@ -94,13 +94,13 @@ def train_nb():
     Naive Bayes Model (Do Third)
     """
     train_df = pd.read_csv(config.get('data', 'file1'))
-    val_df = pd.read_csv(config.get('data', 'file2'))
+    test_df = pd.read_csv(config.get('data', 'file3'))
 
     train_df["Comment"] = train_df["Comment"].apply(process_text)
     bnb_vectorizer = CountVectorizer()
     X_train = bnb_vectorizer.fit_transform(train_df["Comment"])
     y_train = train_df["Result_Bin"]
-    val_df["Comment"] = val_df["Comment"].apply(process_text)
+    test_df["Comment"] = val_df["Comment"].apply(process_text)
     X_val = bnb_vectorizer.transform(val_df["Comment"]) 
     y_val = val_df["Result_Bin"]
 

diff --git a/notebooks/Experiment-CNN-1.ipynb b/notebooks/Experiment-CNN-1.ipynb
@@ -41,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 34,
    "id": "1eac01e8-3c62-469b-96ea-d4babd8f9348",
    "metadata": {},
    "outputs": [
@@ -72,8 +72,8 @@
     "X_train = pad_sequences(X_train_sequences, padding='post', maxlen=87)\n",
     "X_val = pad_sequences(X_val_sequences, padding='post', maxlen=87)\n",
     "label_encoder = LabelEncoder()\n",
-    "y_train = label_encoder.fit_transform(train_df[\"Result_Bin\"])\n",
-    "y_val = label_encoder.fit_transform(val_df[\"Result_Bin\"])"
+    "y_train = train_df[\"Result_Bin\"]\n",
+    "y_val = val_df[\"Result_Bin\"]"
    ]
   },
   {
@@ -444,7 +444,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 32,
    "id": "3a809d75-24a8-4e56-8776-ac4899c95e15",
    "metadata": {},
    "outputs": [],
@@ -454,12 +454,12 @@
     "X_test = pad_sequences(X_test_sequences, padding='post', maxlen=87)\n",
     "label_encoder = LabelEncoder()\n",
     "\n",
-    "y_test = label_encoder.fit_transform(test_df[\"Result_Bin\"])"
+    "y_test = test_df[\"Result_Bin\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 33,
    "id": "9612ac53-cc11-4ee6-acff-d33cabf98e73",
    "metadata": {},
    "outputs": [
@@ -561,20 +561,154 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 30,
    "id": "e96b0c61-e235-4fb2-88af-6149c17184a6",
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'cnn_model' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[28], line 6\u001b[0m\n\u001b[1;32m      4\u001b[0m results_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mActual_Label\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m y_test\n\u001b[1;32m      5\u001b[0m results_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPredicted_Label\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m preds\n\u001b[0;32m----> 6\u001b[0m probs \u001b[38;5;241m=\u001b[39m \u001b[43mcnn_model\u001b[49m\u001b[38;5;241m.\u001b[39mpredict(X_test)\n\u001b[1;32m      7\u001b[0m results_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpredict_proba\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m probs\n\u001b[1;32m      8\u001b[0m results_df\u001b[38;5;241m.\u001b[39mhead()\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'cnn_model' is not defined"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "72/72 [==============================] - 0s 2ms/step\n"
      ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Comment</th>\n",
+       "      <th>Result</th>\n",
+       "      <th>Comment_Adj</th>\n",
+       "      <th>No_Stop</th>\n",
+       "      <th>Stemmed</th>\n",
+       "      <th>Result_Bin</th>\n",
+       "      <th>Actual_Label</th>\n",
+       "      <th>Predicted_Label</th>\n",
+       "      <th>predict_proba</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>I feel like we became too dependent on our rec...</td>\n",
+       "      <td>Loss</td>\n",
+       "      <td>i feel like we became too dependent on our rec...</td>\n",
+       "      <td>feel like became dependent recent late-game he...</td>\n",
+       "      <td>feel like becam depend recent late-gam heroic ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.416957</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>I like it even more when I don't think we're g...</td>\n",
+       "      <td>Win</td>\n",
+       "      <td>i like it even more when i don't think we're g...</td>\n",
+       "      <td>like even n't think 're gon na win win anyways</td>\n",
+       "      <td>like even n't think 're gon na win win anyway</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.844099</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>This game confirmed everything I already knew ...</td>\n",
+       "      <td>Loss</td>\n",
+       "      <td>this game confirmed everything i already knew ...</td>\n",
+       "      <td>game confirmed everything already knew bobby ’...</td>\n",
+       "      <td>game confirm everyth alreadi knew bobbi ’ play...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.533056</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>I’m in shock as a Blazers fan. I know for a fa...</td>\n",
+       "      <td>Loss</td>\n",
+       "      <td>i’m in shock as a blazers fan i know for a fac...</td>\n",
+       "      <td>’ shock blazers fan know fact lillard play way...</td>\n",
+       "      <td>’ shock blazer fan know fact lillard play way ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.542013</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Can we please change the banner to our current...</td>\n",
+       "      <td>Win</td>\n",
+       "      <td>can we please change the banner to our current...</td>\n",
+       "      <td>please change banner current bucks roster ’ wa...</td>\n",
+       "      <td>pleas chang banner current buck roster ’ want ...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.706892</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             Comment Result  \\\n",
+       "0  I feel like we became too dependent on our rec...   Loss   \n",
+       "1  I like it even more when I don't think we're g...    Win   \n",
+       "2  This game confirmed everything I already knew ...   Loss   \n",
+       "3  I’m in shock as a Blazers fan. I know for a fa...   Loss   \n",
+       "4  Can we please change the banner to our current...    Win   \n",
+       "\n",
+       "                                         Comment_Adj  \\\n",
+       "0  i feel like we became too dependent on our rec...   \n",
+       "1  i like it even more when i don't think we're g...   \n",
+       "2  this game confirmed everything i already knew ...   \n",
+       "3  i’m in shock as a blazers fan i know for a fac...   \n",
+       "4  can we please change the banner to our current...   \n",
+       "\n",
+       "                                             No_Stop  \\\n",
+       "0  feel like became dependent recent late-game he...   \n",
+       "1     like even n't think 're gon na win win anyways   \n",
+       "2  game confirmed everything already knew bobby ’...   \n",
+       "3  ’ shock blazers fan know fact lillard play way...   \n",
+       "4  please change banner current bucks roster ’ wa...   \n",
+       "\n",
+       "                                             Stemmed  Result_Bin  \\\n",
+       "0  feel like becam depend recent late-gam heroic ...           0   \n",
+       "1      like even n't think 're gon na win win anyway           1   \n",
+       "2  game confirm everyth alreadi knew bobbi ’ play...           0   \n",
+       "3  ’ shock blazer fan know fact lillard play way ...           0   \n",
+       "4  pleas chang banner current buck roster ’ want ...           1   \n",
+       "\n",
+       "   Actual_Label  Predicted_Label  predict_proba  \n",
+       "0             0                1       0.416957  \n",
+       "1             1                1       0.844099  \n",
+       "2             0                0       0.533056  \n",
+       "3             0                0       0.542013  \n",
+       "4             1                1       0.706892  "
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -583,7 +717,7 @@
     "results_df = test_df.copy()\n",
     "results_df[\"Actual_Label\"] = y_test\n",
     "results_df[\"Predicted_Label\"] = preds\n",
-    "probs = cnn_model.predict(X_test)\n",
+    "probs = model.predict(X_test)\n",
     "results_df['predict_proba'] = probs\n",
     "results_df.head()"
    ]

diff --git a/notebooks/Experiment-Logistic_Regression.ipynb b/notebooks/Experiment-Logistic_Regression.ipynb
diff --git a/notebooks/Experiment-Naive_Bayes.ipynb b/notebooks/Experiment-Naive_Bayes.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "f7c7072e-41a8-4a68-816a-f138dfe2f713",
    "metadata": {},
    "outputs": [],
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "a2007cdd-38d5-40a5-9ed7-4f593d081a2e",
    "metadata": {},
    "outputs": [],
@@ -39,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "f2c29088-90a7-4084-8b2c-be0651c36c20",
    "metadata": {},
    "outputs": [
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "48c88dff-14a5-4975-9670-1cc33cc611df",
    "metadata": {},
    "outputs": [
@@ -122,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "649a1ffa-5717-4efa-ba97-6d6109e7cac7",
    "metadata": {},
    "outputs": [

diff --git a/notebooks/New_Bert.ipynb b/notebooks/New_Bert.ipynb
diff --git a/notebooks/lrt_confusion_matrix.pdf b/notebooks/lrt_confusion_matrix.pdf
diff --git a/notebooks/lrv_confusion_matrix.pdf b/notebooks/lrv_confusion_matrix.pdf
diff --git a/notebooks/nbv_confusion_matrix.pdf b/notebooks/nbv_confusion_matrix.pdf