diff --git a/nlp/__init__.py b/nlp/__init__.py
index db614b4..a731e7b 100644
--- a/nlp/__init__.py
+++ b/nlp/__init__.py
@@ -3,8 +3,8 @@
 """Top-level package for nlp."""
 
 __author__ = """A Student"""
-__email__ = 'student@example.com'
-__version__ = '0.1.0'
+__email__ = 'mmontgomery1@tulane.edu'
+__version__ = '0.1.0'
 
 # -*- coding: utf-8 -*-
 import configparser
@@ -15,9 +15,9 @@
 # here is an example.
 def write_default_config(path):
     w = open(path, 'wt')
-    w.write('[data]\n')
-    w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n')
-    w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv'))
+    w.write('[data]\n')  # source file on Box: implicit_hate_v1_stg1_posts_embedded.tsv
+    w.write('url = https://app.box.com/index.php?rm=box_download_shared_file&shared_name=1ru8jihn2vi8qpruzbgc7dq73pnnqj4u&file_id=f_1519554536417\n')
+    w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'hate_speech_dataset_embeddings.tsv'))
     w.close()
 
 # Find NLP_HOME path
@@ -43,4 +43,4 @@ def write_default_config(path):
 
 # config variable now accessible throughout project.
 config = configparser.RawConfigParser()
-config.read(config_path)
\ No newline at end of file
+config.read(config_path)
diff --git a/nlp/cli.py b/nlp/cli.py
index 15d9be4..04dceeb 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -5,18 +5,22 @@
 import glob
 import pickle
 import sys
-
+import torch
+import torch.nn as nn
 import numpy as np
 import pandas as pd
 import re
 import requests
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import accuracy_score, classification_report
-
+from sentence_transformers import SentenceTransformer
+from torch.optim.lr_scheduler import StepLR
+# sklearn baseline imports, kept commented out for reference:
+#from sklearn.feature_extraction.text import CountVectorizer
+#from sklearn.linear_model import LogisticRegression
+#from sklearn.model_selection import StratifiedKFold
+#from sklearn.metrics import accuracy_score, classification_report
 from . import clf_path, config
 
+
 @click.group()
 def main(args=None):
     """Console script for nlp."""
@@ -45,7 +49,7 @@ def dl_data():
 
 
 def data2df():
-    return pd.read_csv(config.get('data', 'file'))
+    return pd.read_csv(config.get('data', 'file'), sep='\t')
 
 @main.command('stats')
 def stats():
@@ -55,37 +59,73 @@ def stats():
     df = data2df()
     print('%d rows' % len(df))
     print('label counts:')
-    print(df.partisan.value_counts())
+    print(df['class'].value_counts())
 
 @main.command('train')
 def train():
     """
     Train a classifier and save it.
-    """
-    # (1) Read the data...
-    df = data2df()
-    # (2) Create classifier and vectorizer.
-    clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced')
-    vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english')
-    X = vec.fit_transform(df.title)
-    y = df.partisan.values
-    # (3) do cross-validation and print out validation metrics
-    # (classification_report)
-    do_cross_validation(clf, X, y)
-    # (4) Finally, train on ALL data one final time and
-    # train. Save the classifier to disk.
-    clf.fit(X, y)
-    pickle.dump((clf, vec), open(clf_path, 'wb'))
-    top_coef(clf, vec)
+    """
+    df = data2df()
+    clf = IDHate_simple()
+    # Build (X, y) pairs: each X is a (1, 384) float64 embedding row, each y a class index.
+    # Assumes the only numeric columns in the TSV are the 384 embedding dimensions.
+    X = torch.tensor(df.select_dtypes('number').values, dtype=torch.float64)
+    y = torch.tensor([clf.labels.index(c) for c in df['class']])
+    clf._train([(X[i:i + 1], y[i]) for i in range(len(df))], epochs=30, learning_rate=0.01)  # placeholder hyperparameters
+    pickle.dump(clf, open(clf_path, 'wb'))
+
+
+# Defined at module level (not inside train) so that pickle can find the class on load.
+class IDHate_simple(nn.Module):
+    def __init__(self):
+        super(IDHate_simple, self).__init__()
+        self.softmax = nn.Softmax(dim=-1)  # normalize over the 3 classes, not over the batch dimension
+        self.W = nn.Parameter(torch.zeros((384, 3), dtype=torch.float64))  # one weight column per class
+        self.S = torch.zeros((1, 3), requires_grad=False, dtype=torch.float64)  # last raw scores (logits)
+        self.labels = ['implicit_hate', 'not_hate', 'explicit_hate']
+        self.output = torch.zeros((1, 3))  # last class probabilities
+
+    def forward(self, x):
+        self.S = x @ self.W                 # raw class scores (logits)
+        self.output = self.softmax(self.S)  # probabilities, kept for prediction
+        return self.S  # return logits: CrossEntropyLoss applies its own log-softmax
+
+    def _print(self):
+        print("W", "\n", self.W)
+        print("S", "\n", self.S)
+        print("p", "\n", self.output)
+
+    def _train(self, data, epochs, learning_rate):
+        criterion = nn.CrossEntropyLoss()
+        torch.random.manual_seed(42)
+        np.random.seed(42)
+        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
+        sched = StepLR(optimizer, gamma=0.3, step_size=10)
+        loss_val = []
+        # main training loop
+        for epoch in range(epochs):
+            np.random.shuffle(data)
+            losses = []
+            for X, y in data[:300]:  # subsample 300 examples per epoch
+                optimizer.zero_grad()  # reset gradients every step, not once per epoch
+                logits = self.forward(X)
+                loss = criterion(logits[0], y)
+                loss.backward()  # computes all the gradients
+                optimizer.step()
+                losses.append(loss.item())
+            loss_val.append(np.mean(losses))
+            sched.step()
+        return loss_val
 
-def do_cross_validation(clf, X, y):
+"""def do_cross_validation(clf, X, y):
     all_preds = np.zeros(len(y))
     for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y):
         clf.fit(X[train], y[train])
         all_preds[test] = clf.predict(X[test])
-    print(classification_report(y, all_preds))
+    print(classification_report(y, all_preds)) """
 
-def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
+"""def top_coef(clf, vec, labels=['not_hate', 'implicit_hate', "explicit_hate"], n=10):
     feats = np.array(vec.get_feature_names_out())
     print('top coef for %s' % labels[1])
     for i in np.argsort(clf.coef_[0])[::-1][:n]:
@@ -93,6 +133,6 @@ def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
         print('\n\ntop coef for %s' % labels[0])
     for i in np.argsort(clf.coef_[0])[:n]:
         print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
-
+"""
 if __name__ == "__main__":
     sys.exit(main())
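
The diff imports SentenceTransformer but never calls it, so the 384-dimensional embedding columns in the TSV were presumably computed offline. A minimal sketch of that preprocessing step follows; the model name `all-MiniLM-L6-v2` and the `post`/`class` column names are assumptions (any encoder whose output dimension matches the 384 rows of `W` would do):

```python
import pandas as pd
from sentence_transformers import SentenceTransformer

# Assumption: the raw stage-1 file has a 'post' text column and a 'class' label column.
df = pd.read_csv('implicit_hate_v1_stg1_posts.tsv', sep='\t')

# Assumption: all-MiniLM-L6-v2, a standard 384-dimensional sentence encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
emb = model.encode(df['post'].tolist())  # numpy array, shape (n_posts, 384)

# Write one embedding dimension per column next to the label, as data2df() expects.
out = pd.concat([df[['class']], pd.DataFrame(emb)], axis=1)
out.to_csv('implicit_hate_v1_stg1_posts_embedded.tsv', sep='\t', index=False)
```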
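
Once `train` has pickled the model, prediction is a matrix product plus a softmax. A sketch of inference under the same encoder assumption (the fixed `forward` returns raw logits, so the probabilities are computed here):

```python
import pickle
import torch
from sentence_transformers import SentenceTransformer
from nlp import clf_path

clf = pickle.load(open(clf_path, 'rb'))
encoder = SentenceTransformer('all-MiniLM-L6-v2')  # assumption: same encoder as training

x = torch.tensor(encoder.encode(['an example post']), dtype=torch.float64)  # shape (1, 384)
probs = torch.softmax(clf.forward(x), dim=-1)  # clf.forward returns logits
print(clf.labels[int(probs.argmax())], probs.detach().numpy())
```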
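
Finally, the diff comments out `do_cross_validation` and `top_coef` without replacing them, so the torch model is never evaluated. A minimal held-out check under the same column assumptions is sketched below; for an honest number the split would have to happen before `_train` is called, but this shows the shape of the missing evaluation step (`classification_report` still applies even though the sklearn classifier was dropped):

```python
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nlp import clf_path
from nlp.cli import data2df

clf = pickle.load(open(clf_path, 'rb'))
df = data2df()

X = df.select_dtypes('number').values           # assumption: (n, 384) embedding columns
y = [clf.labels.index(c) for c in df['class']]  # class indices
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

with torch.no_grad():
    logits = torch.tensor(X_te, dtype=torch.float64) @ clf.W  # raw class scores
preds = logits.argmax(dim=1).tolist()
print(classification_report(y_te, preds, target_names=clf.labels))
```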