From e1f147ff00f4628b76ad64e84767aa241a260db6 Mon Sep 17 00:00:00 2001 From: petersapountzis Date: Sun, 14 Apr 2024 20:39:38 -0500 Subject: [PATCH] working demo --- nlp/__init__.py | 4 +- nlp/app/routes.py | 83 +++++++++++++++++++----- nlp/app/templates/myform.html | 6 +- nlp/cli.py | 119 +++++++++++++++++++++------------- train.csv | 3 + 5 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 train.csv diff --git a/nlp/__init__.py b/nlp/__init__.py index db614b4..3c59c3c 100644 --- a/nlp/__init__.py +++ b/nlp/__init__.py @@ -16,8 +16,8 @@ def write_default_config(path): w = open(path, 'wt') w.write('[data]\n') - w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n') - w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv')) + w.write('url = https://drive.google.com/drive/folders/1gF0E9E8w1x-yz5FvxS8zFZlSNIivYfhT/train.csv\n') + w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'train.csv')) w.close() # Find NLP_HOME path diff --git a/nlp/app/routes.py b/nlp/app/routes.py index 57a3fff..ed8d2c0 100644 --- a/nlp/app/routes.py +++ b/nlp/app/routes.py @@ -6,23 +6,74 @@ import pickle import sys -clf, vec = pickle.load(open(clf_path, 'rb')) -print('read clf %s' % str(clf)) -print('read vec %s' % str(vec)) -labels = ['liberal', 'conservative'] + +from flask import render_template, flash, redirect, session, request +from . 
import app +from .forms import MyForm +from transformers import AutoTokenizer, AutoModelForSequenceClassification +import torch + +# Assuming the model and tokenizer are loaded similarly to your Jupyter notebook +model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForSequenceClassification.from_pretrained(model_name) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +label_names = ["entailment", "neutral", "contradiction"] @app.route('/', methods=['GET', 'POST']) @app.route('/index', methods=['GET', 'POST']) def index(): - form = MyForm() - result = None - if form.validate_on_submit(): - input_field = form.input_field.data - X = vec.transform([input_field]) - pred = clf.predict(X)[0] - proba = clf.predict_proba(X)[0].max() - # flash(input_field) - return render_template('myform.html', title='Interactive NLI Classifier', form=form, - prediction=labels[pred], confidence='%.2f' % proba) - #return redirect('/index') - return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None) + form = MyForm() + result = None + if form.validate_on_submit(): + premise = form.premise_field.data + hypothesis = form.hypothesis_field.data + print('premise: %s' % premise) + print('hypothesis: %s' % hypothesis) + + # Tokenizing input + inputs = tokenizer(premise, hypothesis, return_tensors='pt').to(device) + with torch.no_grad(): + outputs = model(**inputs) + probs = torch.softmax(outputs.logits, dim=-1) + pred_idx = torch.argmax(probs) + confidence = probs[0, pred_idx].item() + + prediction = label_names[pred_idx] + + return render_template('myform.html', title='Interactive NLI Classifier', form=form, + prediction=prediction, confidence='{:.2f}%'.format(confidence * 100)) + return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None) + + + + + 
+ +# clf, vec = pickle.load(open(clf_path, 'rb')) +# print('read clf %s' % str(clf)) +# print('read vec %s' % str(vec)) +# labels = ['liberal', 'conservative'] + +# @app.route('/', methods=['GET', 'POST']) +# @app.route('/index', methods=['GET', 'POST']) +# def index(): +# form = MyForm() +# result = None +# if form.validate_on_submit(): +# premise = form.premise_field.data +# hypothesis = form.hypothesis_field.data +# print('premise: %s' % premise) +# print('hypothesis: %s' % hypothesis) +# X = vec.transform([input_field]) +# pred = clf.predict(X)[0] +# proba = clf.predict_proba(X)[0].max() +# # flash(input_field) +# return render_template('myform.html', title='Interactive NLI Classifier', form=form, +# prediction=labels[pred], confidence='%.2f' % proba) +# #return redirect('/index') +# return render_template('myform.html', title='Interactive NLI Classifier', form=form, prediction=None, confidence=None) + + diff --git a/nlp/app/templates/myform.html b/nlp/app/templates/myform.html index d11a028..3a40728 100644 --- a/nlp/app/templates/myform.html +++ b/nlp/app/templates/myform.html @@ -1,10 +1,8 @@ -{% extends "base.html" %} {% block content %} +{% extends "base.html" %} Interactive NLI classifier +{% block content %}

{{ form.premise_field.label }} {{ form.premise_field(size=32) }}

- {{ form.submit() }} -
-

{{ form.hypothesis_field.label }} {{ form.hypothesis_field(size=32) }}

diff --git a/nlp/cli.py b/nlp/cli.py index 15d9be4..a273e40 100644 --- a/nlp/cli.py +++ b/nlp/cli.py @@ -10,12 +10,22 @@ import pandas as pd import re import requests -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import StratifiedKFold -from sklearn.metrics import accuracy_score, classification_report -from . import clf_path, config +import torch +from torch.utils.data import TensorDataset +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from torch.utils.data import DataLoader, Dataset +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report + +from . import clf_path, config, config_path + +model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForSequenceClassification.from_pretrained(model_name) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + @click.group() def main(args=None): @@ -36,16 +46,48 @@ def dl_data(): """ Download training/testing data. 
""" - data_url = config.get('data', 'url') - data_file = config.get('data', 'file') + print("Config file path:", config_path) + config.read(config_path) # Reload the configuration + # data_url = config.get('data', 'url') + # data_file = config.get('data', 'file') + data_url = 'https://www.dropbox.com/scl/fi/8afm3cbr1ui1j3qrtv1u9/train.csv?rlkey=d0y73zduv1ira37d5xyd0sg2m&dl=0' + data_file = '/Users/petersapountzis/.nlp/nli_train.csv' print('downloading from %s to %s' % (data_url, data_file)) r = requests.get(data_url) with open(data_file, 'wt') as f: f.write(r.text) -def data2df(): - return pd.read_csv(config.get('data', 'file')) +def load_and_tokenize_data(file_path): + df = pd.read_csv(file_path) + df['premise'] = df['premise'].astype(str) + df['hypothesis'] = df['hypothesis'].astype(str) + tokenized_data = tokenizer(df['premise'].tolist(), df['hypothesis'].tolist(), padding=True, truncation=True, return_tensors="pt") + labels = torch.tensor(df['label'].values) + return tokenized_data, labels + +def train_model(data_file): + tokenized_data, labels = load_and_tokenize_data(data_file) + dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels) + train_loader = DataLoader(dataset, batch_size=16, shuffle=True) + + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5) + criterion = torch.nn.CrossEntropyLoss() + + model.train() + for epoch in range(3): # Example: 3 training epochs + total_loss = 0 + for batch in train_loader: + inputs, masks, labels = (t.to(device) for t in batch) + model.zero_grad() + outputs = model(inputs, attention_mask=masks, labels=labels) + loss = outputs.loss + loss.backward() + optimizer.step() + total_loss += loss.item() + print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}') + + @main.command('stats') def stats(): @@ -58,41 +100,30 @@ def stats(): print(df.partisan.value_counts()) @main.command('train') -def train(): - """ - Train a classifier and save it. - """ - # (1) Read the data... 
- df = data2df() - # (2) Create classifier and vectorizer. - clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced') - vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english') - X = vec.fit_transform(df.title) - y = df.partisan.values - # (3) do cross-validation and print out validation metrics - # (classification_report) - do_cross_validation(clf, X, y) - # (4) Finally, train on ALL data one final time and - # train. Save the classifier to disk. - clf.fit(X, y) - pickle.dump((clf, vec), open(clf_path, 'wb')) - top_coef(clf, vec) - -def do_cross_validation(clf, X, y): - all_preds = np.zeros(len(y)) - for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y): - clf.fit(X[train], y[train]) - all_preds[test] = clf.predict(X[test]) - print(classification_report(y, all_preds)) - -def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10): - feats = np.array(vec.get_feature_names_out()) - print('top coef for %s' % labels[1]) - for i in np.argsort(clf.coef_[0])[::-1][:n]: - print('%20s\t%.2f' % (feats[i], clf.coef_[0][i])) - print('\n\ntop coef for %s' % labels[0]) - for i in np.argsort(clf.coef_[0])[:n]: - print('%20s\t%.2f' % (feats[i], clf.coef_[0][i])) +@click.argument('data_file', type=click.Path(exists=True)) +def train(data_file): + """Train the NLI classifier.""" + train_model(data_file) + print("Training complete.") + + +def predict(premise, hypothesis): + model.eval() + with torch.no_grad(): + inputs = tokenizer(premise, hypothesis, return_tensors="pt").to(device) + outputs = model(**inputs) + predictions = torch.nn.functional.softmax(outputs.logits, dim=-1) + return predictions + + +@main.command('predict') +@click.argument('premise') +@click.argument('hypothesis') +def cli_predict(premise, hypothesis): + """Make a prediction given a premise and a hypothesis.""" + prediction = predict(premise, hypothesis) + print(f"Prediction: {prediction}") + if __name__ == "__main__": 
sys.exit(main()) diff --git a/train.csv b/train.csv new file mode 100644 index 0000000..9126c31 --- /dev/null +++ b/train.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84bfc095e4bb8bfa9846748f22b958643c07a219b09157df271d0376372d698 +size 1685