From bfff90e1f0d072da53ac5ce6c30d988125835de9 Mon Sep 17 00:00:00 2001 From: jcollopy-tulane Date: Sat, 27 Apr 2024 19:54:00 -0500 Subject: [PATCH] Retokenizing --- nlp/cli.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/nlp/cli.py b/nlp/cli.py index 5a435cf..92c6dea 100644 --- a/nlp/cli.py +++ b/nlp/cli.py @@ -97,16 +97,20 @@ def process_text(document): return processed_text - bnb = BernoulliNB() vec_1 = CountVectorizer(tokenizer=process_text) - X = vec_1.fit_transform(train_df["Comment"]) - y = train_df["Result_Bin"] - bnb.fit(X,y) - y_pred = bnb.predict(val_df["Stemmed"]) + X_train = vec_1.fit_transform(train_df["Comment"]) + y_train = train_df["Result_Bin"] + X_val = vec_1.transform(val_df["Comment"]) y_val = val_df["Result_Bin"] - # Calculate F1 + + # Training the model + bnb = BernoulliNB() + bnb.fit(X_train, y_train) + + # Predicting and evaluating + y_pred = bnb.predict(X_val) f1 = f1_score(y_val, y_pred) - print("F1 Score:", round(f1,3)) + print("F1 Score:", round(f1, 3)) @main.command('train_lr') def train_lr(): @@ -136,16 +140,20 @@ def process_text(document): return processed_text - lr = LogisticRegression() - vec_2 = CountVectorizer(tokenizer=process_text) - X = vec_2.fit_transform(train_df["Comment"]) - y = train_df["Result_Bin"] - lr.fit(X,y) - y_pred = lr.predict(val_df["Stemmed"]) + vec_1 = CountVectorizer(tokenizer=process_text) + X_train = vec_1.fit_transform(train_df["Comment"]) + y_train = train_df["Result_Bin"] + X_val = vec_1.transform(val_df["Comment"]) y_val = val_df["Result_Bin"] - # Calculate F1 + + # Training the model + lr = LogisticRegression() + lr.fit(X_train, y_train) + + # Predicting and evaluating + y_pred = lr.predict(X_val) f1 = f1_score(y_val, y_pred) - print("F1 Score:", round(f1,3)) + print("F1 Score:", round(f1, 3)) if __name__ == "__main__":