From bfff90e1f0d072da53ac5ce6c30d988125835de9 Mon Sep 17 00:00:00 2001
From: jcollopy-tulane <jcollopy@tulane.edu>
Date: Sat, 27 Apr 2024 19:54:00 -0500
Subject: [PATCH] Retokenizing

---
 nlp/cli.py | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/nlp/cli.py b/nlp/cli.py
index 5a435cf..92c6dea 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -97,16 +97,20 @@ def process_text(document):
         
         return processed_text
     
-    bnb = BernoulliNB()
     vec_1 = CountVectorizer(tokenizer=process_text)
-    X = vec_1.fit_transform(train_df["Comment"])
-    y = train_df["Result_Bin"]
-    bnb.fit(X,y)
-    y_pred = bnb.predict(val_df["Stemmed"])
+    X_train = vec_1.fit_transform(train_df["Comment"])
+    y_train = train_df["Result_Bin"]
+    X_val = vec_1.transform(val_df["Comment"]) 
     y_val = val_df["Result_Bin"]
-    # Calculate F1
+
+    # Training the model
+    bnb = BernoulliNB()
+    bnb.fit(X_train, y_train)
+
+    # Predicting and evaluating
+    y_pred = bnb.predict(X_val)
     f1 = f1_score(y_val, y_pred)
-    print("F1 Score:", round(f1,3))
+    print("F1 Score:", round(f1, 3))
 
 @main.command('train_lr')
 def train_lr():
@@ -136,16 +140,20 @@ def process_text(document):
         
         return processed_text
         
-    lr = LogisticRegression()
-    vec_2 = CountVectorizer(tokenizer=process_text)
-    X = vec_2.fit_transform(train_df["Comment"])
-    y = train_df["Result_Bin"]
-    lr.fit(X,y)
-    y_pred = lr.predict(val_df["Stemmed"])
+    vec_1 = CountVectorizer(tokenizer=process_text)
+    X_train = vec_1.fit_transform(train_df["Comment"])
+    y_train = train_df["Result_Bin"]
+    X_val = vec_1.transform(val_df["Comment"]) 
     y_val = val_df["Result_Bin"]
-    # Calculate F1
+
+    # Training the model
+    lr = LogisticRegression()
+    lr.fit(X_train, y_train)
+
+    # Predicting and evaluating
+    y_pred = lr.predict(X_val)
     f1 = f1_score(y_val, y_pred)
-    print("F1 Score:", round(f1,3))
+    print("F1 Score:", round(f1, 3))
 
 
 if __name__ == "__main__":