From dec6b45308e7524be95fe4f668deb7a6f6fd68e7 Mon Sep 17 00:00:00 2001
From: jcollopy-tulane
Date: Sat, 27 Apr 2024 19:15:01 -0500
Subject: [PATCH] NB and LR

---
 nlp/cli.py | 47 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/nlp/cli.py b/nlp/cli.py
index efed0ec..5941104 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -33,7 +33,7 @@ def web(port):
 @main.command('dl-data')
 def dl_data():
     """
-    Get data. (Do First)
+    Get data (Do First)
     """
     print("Configuration file path:", config_path)
 
@@ -49,7 +49,7 @@ def dl_data():
 @main.command('data2df')
 def data2df():
     """
-    Get Dataframes. (Do Second)
+    Get Dataframes (Do Second)
     """
     train_df = pd.read_csv(config.get('data', 'file1'))
     val_df = pd.read_csv(config.get('data', 'file2'))
@@ -66,7 +66,17 @@ def data2df():
     # Return the DataFrames as separate variables
     return train_df, val_df, test_df
 
-def process_text(document):
+
+
+@main.command('train_bn')
+def train_nb(train_df, val_df):
+    """
+    Naive Bayes Model (Do Third)
+    """
+    train_df = pd.read_csv(config.get('data', 'file1'))
+    val_df = pd.read_csv(config.get('data', 'file2'))
+
+    def process_text(document):
     # Tokenize the document
     tokens = document.split()
     tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
@@ -83,13 +93,8 @@ def process_text(document):
     # Join the tokens back into a string
     processed_text = ' '.join(stemmed_tokens)
 
-    return processed_text
-
-@main.command('train_bn')
-def train_nb(train_df, val_df):
-    """
-    Naive Bayes Model (Do Third)
-    """
+        return processed_text
+
     bnb = BernoulliNB()
     vec_1 = CountVectorizer(tokenizer=process_text)
     X = vec_1.fit_transform(train_df["Comment"])
@@ -106,6 +111,28 @@ def train_lr(train_df, val_df):
     """
     Logistic Regression Model (Do Fourth)
     """
+
+    train_df = pd.read_csv(config.get('data', 'file1'))
+    val_df = pd.read_csv(config.get('data', 'file2'))
+
+    def process_text(document):
+        # Tokenize the document
+        tokens = document.split()
+        tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
+        tokens = [token.lower() for token in tokens]
+
+        # Remove stopwords
+        stop_words = set(stopwords.words('english'))
+        tokens = [token for token in tokens if token not in stop_words]
+
+        # Stem the tokens
+        stemmer = PorterStemmer()
+        stemmed_tokens = [stemmer.stem(token) for token in tokens]
+
+        # Join the tokens back into a string
+        processed_text = ' '.join(stemmed_tokens)
+
+        return processed_text
     lr = LogisticRegression()
     vec_2 = CountVectorizer(tokenizer=process_text)
     X = vec_2.fit_transform(train_df["Comment"])
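
Note (not part of the patch): both new commands follow the same pipeline, namely read the train/validation CSVs, preprocess the "Comment" column, build bag-of-words counts with CountVectorizer, fit a model, and score it on the validation split. The sketch below is a minimal standalone version of that pipeline, under stated assumptions: the CSVs have a "Comment" text column and a target column called "Label" here purely as a placeholder (the patch never shows the label column name); the NLTK stopwords corpus has already been downloaded; process_text deliberately returns a token list so it can be passed straight to CountVectorizer as the tokenizer, whereas the patch's version joins the tokens back into one string; and max_iter=1000 is an assumed setting chosen only to avoid convergence warnings.

    import re

    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.naive_bayes import BernoulliNB

    # Assumes nltk.download('stopwords') has been run once beforehand.
    STOP_WORDS = set(stopwords.words('english'))
    STEMMER = PorterStemmer()


    def process_text(document):
        # Split on whitespace, strip leading/trailing punctuation, lowercase,
        # drop stopwords, and Porter-stem. Returns a token list so this
        # function can be used directly as a CountVectorizer tokenizer.
        tokens = document.split()
        tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
        tokens = [token.lower() for token in tokens]
        tokens = [token for token in tokens if token and token not in STOP_WORDS]
        return [STEMMER.stem(token) for token in tokens]


    def train_and_evaluate(train_csv, val_csv, label_col='Label'):
        # Load the splits; 'Comment' is the text column used by the patch,
        # 'Label' is a placeholder name for the target column.
        train_df = pd.read_csv(train_csv)
        val_df = pd.read_csv(val_csv)

        vec = CountVectorizer(tokenizer=process_text)
        X_train = vec.fit_transform(train_df['Comment'])
        X_val = vec.transform(val_df['Comment'])
        y_train, y_val = train_df[label_col], val_df[label_col]

        # Fit both models on the same bag-of-words features and compare
        # their validation accuracy.
        models = {
            'BernoulliNB': BernoulliNB(),
            'LogisticRegression': LogisticRegression(max_iter=1000),
        }
        for name, model in models.items():
            model.fit(X_train, y_train)
            acc = accuracy_score(y_val, model.predict(X_val))
            print(f'{name} validation accuracy: {acc:.3f}')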