
working demo
petersapountzis committed Apr 15, 2024
1 parent a724333 commit e1f147f
Showing 5 changed files with 149 additions and 66 deletions.
4 changes: 2 additions & 2 deletions nlp/__init__.py
@@ -16,8 +16,8 @@
def write_default_config(path):
    w = open(path, 'wt')
    w.write('[data]\n')
-    w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n')
-    w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv'))
+    w.write('url = https://drive.google.com/drive/folders/1gF0E9E8w1x-yz5FvxS8zFZlSNIivYfhT/train.csv\n')
+    w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'train.csv'))
    w.close()

# Find NLP_HOME path
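For reference, the [data] section written above is what the CLI reads back through configparser. A minimal sketch of that round trip (the ~/.nlp/config location is illustrative, not taken from this commit):

import configparser, os

config_path = os.path.join(os.path.expanduser('~'), '.nlp', 'config')  # illustrative path
config = configparser.ConfigParser()
config.read(config_path)
data_url = config.get('data', 'url')    # URL written by write_default_config
data_file = config.get('data', 'file')  # local path where train.csv will be stored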
83 changes: 67 additions & 16 deletions nlp/app/routes.py
@@ -6,23 +6,74 @@
import pickle
import sys

-clf, vec = pickle.load(open(clf_path, 'rb'))
-print('read clf %s' % str(clf))
-print('read vec %s' % str(vec))
-labels = ['liberal', 'conservative']

from flask import render_template, flash, redirect, session, request
from . import app
from .forms import MyForm
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+
+# Assuming the model and tokenizer are loaded similarly to your Jupyter notebook
+model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+label_names = ["entailment", "neutral", "contradiction"]
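The hard-coded label_names list has to stay in the same order as the checkpoint's own id-to-label mapping. A small sketch of deriving it from the model config instead of hard-coding it (assuming the standard Hugging Face config fields are populated for this checkpoint):

# Derive the label order from the checkpoint rather than a hand-written list.
label_names = [model.config.id2label[i] for i in range(model.config.num_labels)]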

@app.route('/', methods=['GET', 'POST'])
@app.route('/index', methods=['GET', 'POST'])
def index():
-    form = MyForm()
-    result = None
-    if form.validate_on_submit():
-        input_field = form.input_field.data
-        X = vec.transform([input_field])
-        pred = clf.predict(X)[0]
-        proba = clf.predict_proba(X)[0].max()
-        # flash(input_field)
-        return render_template('myform.html', title='Interactive NLI Classifier', form=form,
-                               prediction=labels[pred], confidence='%.2f' % proba)
-        #return redirect('/index')
-    return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)
+    form = MyForm()
+    result = None
+    if form.validate_on_submit():
+        premise = form.premise_field.data
+        hypothesis = form.hypothesis_field.data
+        print('premise: %s' % premise)
+        print('hypothesis: %s' % hypothesis)
+
+        # Tokenizing input
+        inputs = tokenizer(premise, hypothesis, return_tensors='pt').to(device)
+        with torch.no_grad():
+            outputs = model(**inputs)
+            probs = torch.softmax(outputs.logits, dim=-1)
+            pred_idx = torch.argmax(probs)
+            confidence = probs[0, pred_idx].item()
+
+        prediction = label_names[pred_idx]
+
+        return render_template('myform.html', title='Interactive NLI Classifier', form=form,
+                               prediction=prediction, confidence='{:.2f}%'.format(confidence * 100))
+    return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)






+# clf, vec = pickle.load(open(clf_path, 'rb'))
+# print('read clf %s' % str(clf))
+# print('read vec %s' % str(vec))
+# labels = ['liberal', 'conservative']
+
+# @app.route('/', methods=['GET', 'POST'])
+# @app.route('/index', methods=['GET', 'POST'])
+# def index():
+# form = MyForm()
+# result = None
+# if form.validate_on_submit():
+# premise = form.premise_field.data
+# hypothesis = form.hypothesis_field.data
+# print('premise: %s' % premise)
+# print('hypothesis: %s' % hypothesis)
+# X = vec.transform([input_field])
+# pred = clf.predict(X)[0]
+# proba = clf.predict_proba(X)[0].max()
+# # flash(input_field)
+# return render_template('myform.html', title='Interactive NLI Classifier', form=form,
+# prediction=labels[pred], confidence='%.2f' % proba)
+# #return redirect('/index')
+# return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None)
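routes.py assumes that MyForm (imported from .forms) exposes premise_field, hypothesis_field, and a submit button, matching what myform.html renders below. forms.py itself is not part of this commit; a hypothetical Flask-WTF sketch of what it would need to contain:

from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired

class MyForm(FlaskForm):
    # Field names follow the references in routes.py and myform.html.
    premise_field = StringField('Premise', validators=[DataRequired()])
    hypothesis_field = StringField('Hypothesis', validators=[DataRequired()])
    submit = SubmitField('Classify')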


6 changes: 2 additions & 4 deletions nlp/app/templates/myform.html
@@ -1,10 +1,8 @@
-{% extends "base.html" %} {% block content %}
+{% extends "base.html" %}
<b>Interactive NLI classifier</b>
+{% block content %}
<form action="" method="post" novalidate>
<p>{{ form.premise_field.label }} {{ form.premise_field(size=32) }}</p>
-{{ form.submit() }}
-</form>
-<form action="" method="post" novalidate>
<p>
{{ form.hypothesis_field.label }} {{ form.hypothesis_field(size=32) }}
</p>
119 changes: 75 additions & 44 deletions nlp/cli.py
@@ -10,12 +10,22 @@
import pandas as pd
import re
import requests
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import accuracy_score, classification_report

-from . import clf_path, config
+import torch
+from torch.utils.data import TensorDataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from torch.utils.data import DataLoader, Dataset
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+
+from . import clf_path, config, config_path
+
+model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)


@click.group()
def main(args=None):
@@ -36,16 +46,48 @@ def dl_data():
"""
Download training/testing data.
"""
data_url = config.get('data', 'url')
data_file = config.get('data', 'file')
print("Config file path:", config_path)
config.read(config_path) # Reload the configuration
# data_url = config.get('data', 'url')
# data_file = config.get('data', 'file')
data_url = 'https://www.dropbox.com/scl/fi/8afm3cbr1ui1j3qrtv1u9/train.csv?rlkey=d0y73zduv1ira37d5xyd0sg2m&dl=0'
data_file = '/Users/petersapountzis/.nlp/nli_train.csv'
print('downloading from %s to %s' % (data_url, data_file))
r = requests.get(data_url)
with open(data_file, 'wt') as f:
f.write(r.text)
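Note that a Dropbox share link with dl=0 normally returns an HTML preview page rather than the raw CSV, so the file saved above may not parse as CSV. A one-line sketch of the usual workaround, assuming the same share URL:

data_url = data_url.replace('dl=0', 'dl=1')  # request the raw file instead of the preview page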


-def data2df():
-    return pd.read_csv(config.get('data', 'file'))
+def load_and_tokenize_data(file_path):
+    df = pd.read_csv(file_path)
+    df['premise'] = df['premise'].astype(str)
+    df['hypothesis'] = df['hypothesis'].astype(str)
+    tokenized_data = tokenizer(df['premise'].tolist(), df['hypothesis'].tolist(), padding=True, truncation=True, return_tensors="pt")
+    labels = torch.tensor(df['label'].values)
+    return tokenized_data, labels

+def train_model(data_file):
+    tokenized_data, labels = load_and_tokenize_data(data_file)
+    dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels)
+    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+    criterion = torch.nn.CrossEntropyLoss()
+
+    model.train()
+    for epoch in range(3):  # Example: 3 training epochs
+        total_loss = 0
+        for batch in train_loader:
+            inputs, masks, labels = (t.to(device) for t in batch)
+            model.zero_grad()
+            outputs = model(inputs, attention_mask=masks, labels=labels)
+            loss = outputs.loss
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')
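train_model updates the weights in memory but never writes them to disk, so the web app keeps serving the base checkpoint. A minimal sketch of persisting and reloading the fine-tuned model with the standard Hugging Face API (the ~/.nlp/nli_model directory is an assumed location, not something this commit defines):

save_dir = '/Users/petersapountzis/.nlp/nli_model'  # assumed output directory
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Later (e.g. in routes.py) the fine-tuned weights could be loaded in place of the base checkpoint:
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)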



@main.command('stats')
def stats():
@@ -58,41 +100,30 @@ def stats():
    print(df.partisan.value_counts())

@main.command('train')
-def train():
-    """
-    Train a classifier and save it.
-    """
-    # (1) Read the data...
-    df = data2df()
-    # (2) Create classifier and vectorizer.
-    clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced')
-    vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english')
-    X = vec.fit_transform(df.title)
-    y = df.partisan.values
-    # (3) do cross-validation and print out validation metrics
-    # (classification_report)
-    do_cross_validation(clf, X, y)
-    # (4) Finally, train on ALL data one final time and
-    # train. Save the classifier to disk.
-    clf.fit(X, y)
-    pickle.dump((clf, vec), open(clf_path, 'wb'))
-    top_coef(clf, vec)
-
-def do_cross_validation(clf, X, y):
-    all_preds = np.zeros(len(y))
-    for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y):
-        clf.fit(X[train], y[train])
-        all_preds[test] = clf.predict(X[test])
-    print(classification_report(y, all_preds))
-
-def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10):
-    feats = np.array(vec.get_feature_names_out())
-    print('top coef for %s' % labels[1])
-    for i in np.argsort(clf.coef_[0])[::-1][:n]:
-        print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
-    print('\n\ntop coef for %s' % labels[0])
-    for i in np.argsort(clf.coef_[0])[:n]:
-        print('%20s\t%.2f' % (feats[i], clf.coef_[0][i]))
+@click.argument('data_file', type=click.Path(exists=True))
+def train(data_file):
+    """Train the NLI classifier."""
+    train_model(data_file)
+    print("Training complete.")


+def predict(premise, hypothesis):
+    model.eval()
+    with torch.no_grad():
+        inputs = tokenizer(premise, hypothesis, return_tensors="pt").to(device)
+        outputs = model(**inputs)
+        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    return predictions


+@main.command('predict')
+@click.argument('premise')
+@click.argument('hypothesis')
+def cli_predict(premise, hypothesis):
+    """Make a prediction given a premise and a hypothesis."""
+    prediction = predict(premise, hypothesis)
+    print(f"Prediction: {prediction}")


if __name__ == "__main__":
    sys.exit(main())
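As a quick sanity check of the new predict helper, its softmax output can be mapped back to label names (label order as defined in routes.py; this snippet is illustrative and not part of the commit):

from nlp.cli import predict

probs = predict("A man is playing a guitar.", "A person is making music.")[0]
label_names = ["entailment", "neutral", "contradiction"]
print({name: round(float(p), 3) for name, p in zip(label_names, probs)})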
3 changes: 3 additions & 0 deletions train.csv
Git LFS file not shown
