generated from tulane-cmps6730/sample-project
commit 2c940fa (1 parent: b8522ca)
Showing 4 changed files with 146 additions and 2 deletions.
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import DistilBertTokenizer  # needed by bert_process below

def process_text(document):
    # Tokenize the document
    tokens = document.split()
    tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stem the tokens
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Return the processed text
    return ' '.join(stemmed_tokens)
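
# Rough usage sketch (assumes the NLTK stopwords corpus has been fetched
# with nltk.download('stopwords')):
#   process_text("Cats are running quickly!")  ->  roughly "cat run quickli"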

tokenizer = Tokenizer()
# Assumption: separate val.csv and test.csv files exist alongside train.csv
# (the commit read train.csv for all three splits).
train_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/train.csv")
val_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/val.csv")
test_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/test.csv")

def basic_process(document):
    # Tokenize the document
    tokens = document.split()
    # Remove punctuation at the start and end of each token and convert to lowercase
    tokens = [re.sub(r'^\W+|\W+$', '', token).lower() for token in tokens]
    # Join processed tokens back into a string
    processed_text = ' '.join(tokens)
    return processed_text

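# Sketch: basic_process("Hello, World!") -> "hello world"
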
def cnn_process(document):
    processed_document = basic_process(document)

    # A fresh Tokenizer is fitted on every call (the module-level tokenizer
    # above is not reused), over the combined train/val/test comments.
    tokenizer = Tokenizer()
    texts = pd.concat([train_df["Comment_Adj"], val_df["Comment_Adj"], test_df["Comment_Adj"]])
    tokenizer.fit_on_texts(texts)

    # all_sequences is computed but not used further in this function
    all_sequences = tokenizer.texts_to_sequences(texts)
    sequences = tokenizer.texts_to_sequences([processed_document])

    # Pad (or truncate) to the fixed CNN input length of 87 tokens
    padded_sequences = pad_sequences(sequences, maxlen=87, padding='post')
    return padded_sequences

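# Sketch: cnn_process("so funny lol") returns a NumPy array of shape (1, 87),
# zero-padded after the real token ids.
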
def bert_process(document):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    inputs = tokenizer.encode_plus(
        document,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='tf'
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    return input_ids, attention_mask
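
A minimal usage sketch, assuming the NLTK stopwords corpus has been downloaded, the Hugging Face transformers package is installed, and the train/val/test CSVs with a "Comment_Adj" column exist at the paths above:

if __name__ == "__main__":
    sample = "This thread is hilarious, upvoted!"
    print(process_text(sample))         # stemmed, stopword-free string
    print(cnn_process(sample).shape)    # (1, 87) padded id sequence
    ids, mask = bert_process(sample)
    print(ids.shape, mask.shape)        # (1, 128) tensors for DistilBERT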