From e1f147ff00f4628b76ad64e84767aa241a260db6 Mon Sep 17 00:00:00 2001
From: petersapountzis <peter.sapountzis@gmail.com>
Date: Sun, 14 Apr 2024 20:39:38 -0500
Subject: [PATCH] working demo

---
 nlp/__init__.py               |   4 +-
 nlp/app/routes.py             |  83 +++++++++++++++++++-----
 nlp/app/templates/myform.html |   6 +-
 nlp/cli.py                    | 119 +++++++++++++++++++++-------------
 train.csv                     |   3 +
 5 files changed, 149 insertions(+), 66 deletions(-)
 create mode 100644 train.csv

diff --git a/nlp/__init__.py b/nlp/__init__.py
index db614b4..3c59c3c 100644
--- a/nlp/__init__.py
+++ b/nlp/__init__.py
@@ -16,8 +16,8 @@
 def write_default_config(path):
 	w = open(path, 'wt')
 	w.write('[data]\n')
-	w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n')
-	w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv'))
+	w.write('url = https://drive.google.com/drive/folders/1gF0E9E8w1x-yz5FvxS8zFZlSNIivYfhT/train.csv\n')  # NOTE(review): looks like a Drive folder-view URL, not a direct file download -- confirm it is fetchable
+	w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'train.csv'))  # local copy is stored under nlp_path
 	w.close()
 
 # Find NLP_HOME path
diff --git a/nlp/app/routes.py b/nlp/app/routes.py
index 57a3fff..ed8d2c0 100644
--- a/nlp/app/routes.py
+++ b/nlp/app/routes.py
@@ -6,23 +6,74 @@
 import pickle
 import sys
 
-clf, vec = pickle.load(open(clf_path, 'rb'))
-print('read clf %s' % str(clf))
-print('read vec %s' % str(vec))
-labels = ['liberal', 'conservative']
+
+from flask import render_template, flash, redirect, session, request
+from . import app
+from .forms import MyForm
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+
+# Load the pretrained NLI tokenizer and model once, when this module is imported.
+model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer the GPU when one is available
+model.to(device)
+
+label_names = ["entailment", "neutral", "contradiction"]  # NOTE(review): assumed to follow the model's output index order -- confirm against model.config.id2label
 
 @app.route('/', methods=['GET', 'POST'])
 @app.route('/index', methods=['GET', 'POST'])
 def index():
-	form = MyForm()
-	result = None
-	if form.validate_on_submit():
-		input_field = form.input_field.data
-		X = vec.transform([input_field])
-		pred = clf.predict(X)[0]
-		proba = clf.predict_proba(X)[0].max()
-		# flash(input_field)
-		return render_template('myform.html', title='Interactive NLI Classifier', form=form, 
-								prediction=labels[pred], confidence='%.2f' % proba)
-		#return redirect('/index')
-	return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)
+    """Render the NLI form; on a valid submission, classify the premise/hypothesis pair."""
+    form = MyForm()
+    if form.validate_on_submit():
+        premise = form.premise_field.data
+        hypothesis = form.hypothesis_field.data
+        print('premise: %s' % premise)
+        print('hypothesis: %s' % hypothesis)
+
+        # Encode the pair as one sequence. truncation=True mirrors the training-time
+        # tokenization in nlp/cli.py and caps over-long input at the tokenizer's max length.
+        inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors='pt').to(device)
+        with torch.no_grad():  # inference only: skip gradient tracking
+            outputs = model(**inputs)
+            probs = torch.softmax(outputs.logits, dim=-1)
+            pred_idx = torch.argmax(probs).item()  # plain int, usable as a list index
+            confidence = probs[0, pred_idx].item()
+
+        prediction = label_names[pred_idx]
+        return render_template('myform.html', title='Interactive NLI Classifier', form=form,
+                               prediction=prediction, confidence='{:.2f}%'.format(confidence * 100))
+    return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)
+
+
+
+
+
+
+# clf, vec = pickle.load(open(clf_path, 'rb'))
+# print('read clf %s' % str(clf))
+# print('read vec %s' % str(vec))
+# labels = ['liberal', 'conservative']
+
+# @app.route('/', methods=['GET', 'POST'])
+# @app.route('/index', methods=['GET', 'POST'])
+# def index():
+# 	form = MyForm()
+# 	result = None
+# 	if form.validate_on_submit():
+# 		premise = form.premise_field.data
+# 		hypothesis = form.hypothesis_field.data
+# 		print('premise: %s' % premise)
+# 		print('hypothesis: %s' % hypothesis)
+# 		X = vec.transform([input_field])
+# 		pred = clf.predict(X)[0]
+# 		proba = clf.predict_proba(X)[0].max()
+# 		# flash(input_field)
+# 		return render_template('myform.html', title='Interactive NLI Classifier', form=form, 
+# 								prediction=labels[pred], confidence='%.2f' % proba)
+# 		#return redirect('/index')
+# 	return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)
+
+
diff --git a/nlp/app/templates/myform.html b/nlp/app/templates/myform.html
index d11a028..3a40728 100644
--- a/nlp/app/templates/myform.html
+++ b/nlp/app/templates/myform.html
@@ -1,10 +1,8 @@
-{% extends "base.html" %} {% block content %}
+{% extends "base.html" %}
+{% block content %}{# heading stays inside the block: a child template renders nothing outside its blocks #}
 <b>Interactive NLI classifier</b>
 <form action="" method="post" novalidate>
     <p>{{ form.premise_field.label }} {{ form.premise_field(size=32) }}</p>
-    {{ form.submit() }}
-</form>
-<form action="" method="post" novalidate>
     <p>
         {{ form.hypothesis_field.label }} {{ form.hypothesis_field(size=32) }}
     </p>
diff --git a/nlp/cli.py b/nlp/cli.py
index 15d9be4..a273e40 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -10,12 +10,22 @@
 import pandas as pd
 import re
 import requests
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import accuracy_score, classification_report
 
-from . import clf_path, config
+import torch
+from torch.utils.data import TensorDataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from torch.utils.data import DataLoader, Dataset
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+
+from . import clf_path, config, config_path
+
+model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"  # same checkpoint nlp/app/routes.py loads
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)  # runs at import, i.e. for every CLI command
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer the GPU when one is available
+model.to(device)
+
 
 @click.group()
 def main(args=None):
@@ -36,16 +46,48 @@ def dl_data():
     """
     Download training/testing data.
     """
-    data_url = config.get('data', 'url')
-    data_file = config.get('data', 'file')
+    print("Config file path:", config_path)
+    config.read(config_path)  # Reload the configuration
+    # dl=1 makes Dropbox return the raw CSV; dl=0 serves an HTML preview page instead.
+    data_url = 'https://www.dropbox.com/scl/fi/8afm3cbr1ui1j3qrtv1u9/train.csv?rlkey=d0y73zduv1ira37d5xyd0sg2m&dl=1'
+    # Write to the configured location rather than one developer's home directory.
+    data_file = config.get('data', 'file')
     print('downloading from %s to %s' % (data_url, data_file))
     r = requests.get(data_url)
     with open(data_file, 'wt') as f:
         f.write(r.text)
     
 
-def data2df():
-    return pd.read_csv(config.get('data', 'file'))
+def load_and_tokenize_data(file_path):
+    """Read the CSV at file_path; return (tokenized premise/hypothesis pairs, label tensor)."""
+    frame = pd.read_csv(file_path)
+    premises, hypotheses = (frame[col].astype(str).tolist() for col in ('premise', 'hypothesis'))
+    encoded = tokenizer(premises, hypotheses, padding=True, truncation=True, return_tensors="pt")
+    targets = torch.tensor(frame['label'].values)
+    return encoded, targets
+
+def train_model(data_file):
+    """Fine-tune the module-level model on the CSV at data_file: 3 epochs, batches of 16, AdamW."""
+    encodings, targets = load_and_tokenize_data(data_file)
+    train_set = TensorDataset(encodings['input_ids'], encodings['attention_mask'], targets)
+    batches = DataLoader(train_set, batch_size=16, shuffle=True)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+    criterion = torch.nn.CrossEntropyLoss()  # not applied below; the loss is read from the model output
+
+    model.train()
+    for epoch_no in range(1, 4):  # three passes over the training set
+        running_loss = 0
+        for batch in batches:
+            input_ids, attention_mask, batch_labels = [t.to(device) for t in batch]
+            model.zero_grad()
+            step_loss = model(input_ids, attention_mask=attention_mask, labels=batch_labels).loss
+            step_loss.backward()
+            optimizer.step()
+            running_loss += step_loss.item()
+        print(f'Epoch {epoch_no}, Loss: {running_loss / len(batches)}')
+
+
 
 @main.command('stats')
 def stats():
@@ -58,41 +100,30 @@ def stats():
     print(df.partisan.value_counts())    
 
 @main.command('train')
-def train():
-    """
-    Train a classifier and save it.
-    """
-    # (1) Read the data...
-    df = data2df()    
-    # (2) Create classifier and vectorizer.
-    clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced')         
-    vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english')
-    X = vec.fit_transform(df.title)
-    y = df.partisan.values
-    # (3) do cross-validation and print out validation metrics
-    # (classification_report)
-    do_cross_validation(clf, X, y)
-    # (4) Finally, train on ALL data one final time and
-    # train. Save the classifier to disk.
-    clf.fit(X, y)
-    pickle.dump((clf, vec), open(clf_path, 'wb'))
-    top_coef(clf, vec)
-
-def do_cross_validation(clf, X, y):
-    all_preds = np.zeros(len(y))
-    for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y):
-        clf.fit(X[train], y[train])
-        all_preds[test] = clf.predict(X[test])
-    print(classification_report(y, all_preds))    
-
-def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
-    feats = np.array(vec.get_feature_names_out())
-    print('top coef for %s' % labels[1])
-    for i in np.argsort(clf.coef_[0])[::-1][:n]:
-        print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
-    print('\n\ntop coef for %s' % labels[0])
-    for i in np.argsort(clf.coef_[0])[:n]:
-        print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
+@click.argument('data_file', type=click.Path(exists=True))  # click rejects a path that does not exist
+def train(data_file):
+    """Train the NLI classifier."""
+    train_model(data_file)  # runs the training loop defined earlier in this module
+    print("Training complete.")
+
+
+def predict(premise, hypothesis):
+    """Return the softmax class probabilities the model assigns to a premise/hypothesis pair."""
+    model.eval()
+    encoded = tokenizer(premise, hypothesis, return_tensors="pt").to(device)
+    with torch.no_grad():  # inference only: skip gradient tracking
+        logits = model(**encoded).logits
+    return torch.nn.functional.softmax(logits, dim=-1)
+
+
+@main.command('predict')
+@click.argument('premise')
+@click.argument('hypothesis')
+def cli_predict(premise, hypothesis):
+    """Make a prediction given a premise and a hypothesis."""
+    # Whatever predict() returns is printed as-is; no label lookup happens here.
+    print(f"Prediction: {predict(premise, hypothesis)}")
+
 
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/train.csv b/train.csv
new file mode 100644
index 0000000..9126c31
--- /dev/null
+++ b/train.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e84bfc095e4bb8bfa9846748f22b958643c07a219b09157df271d0376372d698
+size 1685