From 862bdc858df0f7326f795e07d33e89b05b1868da Mon Sep 17 00:00:00 2001
From: Merrilee Montgomery <109876041+CappucciNOPE@users.noreply.github.com>
Date: Thu, 2 May 2024 12:02:39 -0500
Subject: [PATCH] Use hate-speech embeddings data and a torch classifier

---
 nlp/__init__.py | 13 +++----
 nlp/cli.py      | 94 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 73 insertions(+), 34 deletions(-)
diff --git a/nlp/__init__.py b/nlp/__init__.py
index db614b4..a731e7b 100644
--- a/nlp/__init__.py
+++ b/nlp/__init__.py
@@ -3,8 +3,8 @@
 """Top-level package for nlp."""
 
 __author__ = """A Student"""
-__email__ = 'student@example.com'
+__email__ = 'mmontgomery1@tulane.edu'
 __version__ = '0.1.0'
 
 # -*- coding: utf-8 -*-
 import configparser
@@ -15,9 +15,9 @@
 # here is an example.
 def write_default_config(path):
 	w = open(path, 'wt')
-	w.write('[data]\n')
-	w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n')
-	w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv'))
+	w.write('[data]\n')  # the url below is the Box share of implicit_hate_v1_stg1_posts_embedded.tsv
+	w.write('url = https://app.box.com/index.php?rm=box_download_shared_file&shared_name=1ru8jihn2vi8qpruzbgc7dq73pnnqj4u&file_id=f_1519554536417\n')
+	w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'hate_speech_dataset_embeddings.tsv'))
 	w.close()
 
 # Find NLP_HOME path
@@ -43,4 +43,4 @@ def write_default_config(path):
 
 # config variable now accessible throughout project.
 config = configparser.RawConfigParser()
-config.read(config_path)
\ No newline at end of file
+config.read(config_path)
diff --git a/nlp/cli.py b/nlp/cli.py
index 15d9be4..04dceeb 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -5,18 +5,21 @@
 import glob
 import pickle
 import sys
-
+import torch
 import numpy as np
 import pandas as pd
 import re
 import requests
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import accuracy_score, classification_report
-
+from sentence_transformers import SentenceTransformer
+from torch.optim.lr_scheduler import StepLR
+#from sklearn.feature_extraction.text import CountVectorizer
+#from sklearn.linear_model import LogisticRegression
+#from sklearn.model_selection import StratifiedKFold
+#from sklearn.metrics import accuracy_score, classification_report
+import torch.nn as nn
 from . import clf_path, config
 
+
 @click.group()
 def main(args=None):
     """Console script for nlp."""
@@ -45,7 +48,7 @@ def dl_data():
     
 
 def data2df():
-    return pd.read_csv(config.get('data', 'file'))
+    return pd.read_csv(config.get('data', 'file'),sep="\t")  # the configured data file is tab-separated (.tsv)
 
 @main.command('stats')
 def stats():
@@ -55,37 +58,72 @@ def stats():
     df = data2df()
     print('%d rows' % len(df))
     print('label counts:')
-    print(df.partisan.value_counts())    
+    print(df['class'].value_counts())  # bracket access: 'class' is a keyword, so df.class would not parse
 
 @main.command('train')
 def train():
     """
     Train a classifier and save it.
-    """
-    # (1) Read the data...
-    df = data2df()    
-    # (2) Create classifier and vectorizer.
-    clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced')         
-    vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english')
-    X = vec.fit_transform(df.title)
-    y = df.partisan.values
-    # (3) do cross-validation and print out validation metrics
-    # (classification_report)
-    do_cross_validation(clf, X, y)
-    # (4) Finally, train on ALL data one final time and
-    # train. Save the classifier to disk.
-    clf.fit(X, y)
-    pickle.dump((clf, vec), open(clf_path, 'wb'))
-    top_coef(clf, vec)
+    """
+    class IDHate_simple(nn.Module):
+        """Linear softmax classifier: 384-d input vector -> 3 class probabilities."""
+        def __init__(self):
+            super(IDHate_simple,self).__init__()
+            self.softmax = nn.Softmax(dim=-1)  # normalise over classes (last axis); dim=0 on a (1,3) score gives all ones
+            self.sigmoid = nn.Sigmoid()  # currently unused
+            self.tanh = nn.Tanh()  # currently unused
+            self.W = nn.Parameter(torch.zeros((384,3),dtype=torch.float64),requires_grad=True)  # the only trainable weights
+            self.S = torch.zeros((1,3),requires_grad=False,dtype=torch.float64)  # latest raw scores, set by forward()
+            self.labels = ['implicit_hate', 'not_hate', 'explicit_hate']
+            self.output = torch.zeros((1,3))  # latest probabilities, set by forward()
+
+        def forward(self,x):
+            self.S= x @ self.W  # raw class scores (logits)
+            self.output = self.softmax(self.S)
+            return self.output
+
-def do_cross_validation(clf, X, y):
+        def _print(self):
+            """Print W plus the scores and probabilities of the last forward pass."""
+            print("W","\n",self.W)
+            print("S","\n",self.S)
+            print("p","\n",self.output)
+
+        def _train(self,data,epochs=30,learning_rate=0.01):
+            """Fit W with Adam and return the mean loss per epoch (defaults are untuned)."""
+            criterion = nn.CrossEntropyLoss()
+            torch.random.manual_seed(42)
+            np.random.seed(42)
+            optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
+            sched = StepLR(optimizer,gamma=0.3,step_size=10)
+            loss_val = []
+            # NOTE(review): the loop expects (X, y) tensor pairs, but train() passes a DataFrame -- confirm and convert
+            for epoch in range(epochs):
+                np.random.shuffle(data)
+                losses = []
+                for batch, (X, y) in enumerate(data[:300]):
+                    optimizer.zero_grad()  # gradients accumulate, so clear them before every step
+                    self.forward(X)  # refreshes self.S, the raw scores
+                    loss = criterion(self.S[0],y)  # CrossEntropyLoss expects raw scores, not softmax output
+                    loss.backward()      # computes all the gradients
+                    optimizer.step()
+                    losses.append(loss.item())
+                loss_val.append(np.mean(losses))
+                sched.step()
+            return loss_val
+    df = data2df()
+    clf = IDHate_simple()
+    clf._train(df)
+    with open(clf_path, 'wb') as f:
+        pickle.dump(clf.state_dict(), f)  # a class local to train() cannot be pickled, so save its weights
+
+"""def do_cross_validation(clf, X, y):
     all_preds = np.zeros(len(y))
     for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y):
         clf.fit(X[train], y[train])
         all_preds[test] = clf.predict(X[test])
-    print(classification_report(y, all_preds))    
+    print(classification_report(y, all_preds))    """
 
-def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
+"""def top_coef(clf, vec, labels=['not_hate', 'implicit_hate', "explicit_hate"], n=10):
     feats = np.array(vec.get_feature_names_out())
     print('top coef for %s' % labels[1])
     for i in np.argsort(clf.coef_[0])[::-1][:n]:
@@ -93,6 +131,6 @@ def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
     print('\n\ntop coef for %s' % labels[0])
     for i in np.argsort(clf.coef_[0])[:n]:
         print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
-
+"""
 if __name__ == "__main__":
     sys.exit(main())