diff --git a/nlp/__init__.py b/nlp/__init__.py deleted file mode 100644 index db614b4..0000000 --- a/nlp/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Top-level package for nlp.""" - -__author__ = """A Student""" -__email__ = 'student@example.com' -__version__ = '0.1.0' - -# -*- coding: utf-8 -*- -import configparser -import os - -# ~/.nlp/nlp.cfg will contain configuration information for the project, -# such as where data will be downloaded from. -# here is an example. -def write_default_config(path): - w = open(path, 'wt') - w.write('[data]\n') - w.write('url = https://www.dropbox.com/s/o0nxd8pnwy809u2/headlines.csv?dl=1\n') - w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'headlines.csv')) - w.close() - -# Find NLP_HOME path -if 'NLP_HOME' in os.environ: - nlp_path = os.environ['NLP_HOME'] -else: - nlp_path = os.environ['HOME'] + os.path.sep + '.nlp' + os.path.sep - -# Make nlp directory if not present -try: - os.makedirs(nlp_path) -except: - pass - -# main config file. -config_path = nlp_path + 'nlp.cfg' -# classifier -clf_path = nlp_path + 'clf.pkl' - -# write default config if not present. -if not os.path.isfile(config_path): - write_default_config(config_path) - -# config variable now accessible throughout project. -config = configparser.RawConfigParser() -config.read(config_path) \ No newline at end of file diff --git a/nlp/app/__init__.py b/nlp/app/__init__.py deleted file mode 100644 index 8c02e85..0000000 --- a/nlp/app/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from flask import Flask -import os -from .. import nlp_path -app = Flask(__name__) -app.config['SECRET_KEY'] = 'you-will-never-guess' # for CSRF - -from . import routes \ No newline at end of file diff --git a/nlp/app/forms.py b/nlp/app/forms.py deleted file mode 100644 index 5395597..0000000 --- a/nlp/app/forms.py +++ /dev/null @@ -1,12 +0,0 @@ -from flask_wtf import FlaskForm -from wtforms import StringField, SubmitField -from wtforms.validators import DataRequired - -class MyForm(FlaskForm): - class Meta: # Ignoring CSRF security feature. - csrf = False - - input_field = StringField(label='input headline:', id='input_field', - validators=[DataRequired()], - render_kw={'style': 'width:50%'}) - submit = SubmitField('Submit') \ No newline at end of file diff --git a/nlp/app/routes.py b/nlp/app/routes.py deleted file mode 100644 index ec4db3f..0000000 --- a/nlp/app/routes.py +++ /dev/null @@ -1,28 +0,0 @@ -from flask import render_template, flash, redirect, session -from . import app -from .forms import MyForm -from .. import clf_path - -import pickle -import sys - -clf, vec = pickle.load(open(clf_path, 'rb')) -print('read clf %s' % str(clf)) -print('read vec %s' % str(vec)) -labels = ['liberal', 'conservative'] - -@app.route('/', methods=['GET', 'POST']) -@app.route('/index', methods=['GET', 'POST']) -def index(): - form = MyForm() - result = None - if form.validate_on_submit(): - input_field = form.input_field.data - X = vec.transform([input_field]) - pred = clf.predict(X)[0] - proba = clf.predict_proba(X)[0].max() - # flash(input_field) - return render_template('myform.html', title='', form=form, - prediction=labels[pred], confidence='%.2f' % proba) - #return redirect('/index') - return render_template('myform.html', title='', form=form, prediction=None, confidence=None) diff --git a/nlp/app/static/__init__.py b/nlp/app/static/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/nlp/app/static/main.css b/nlp/app/static/main.css deleted file mode 100644 index 3277346..0000000 --- a/nlp/app/static/main.css +++ /dev/null @@ -1,20 +0,0 @@ -body { - margin: 50px; - padding: 50px; - font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; - color: #444; -} -/* - * Formatting the header area - */ -header { - background-color: #DFB887; - height: 35px; - width: 100%; - opacity: .9; - margin-bottom: 10px; -} - -label { - color: black; -} \ No newline at end of file diff --git a/nlp/app/templates/__init__.py b/nlp/app/templates/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/nlp/app/templates/base.html b/nlp/app/templates/base.html deleted file mode 100644 index d747410..0000000 --- a/nlp/app/templates/base.html +++ /dev/null @@ -1,34 +0,0 @@ - - - {% if title %} - {{ title }} - nlp - {% else %} - nlp - {% endif %} - - - -
my nlp project
-
- {% if form.errors %} - {{ form.errors }} - {% endif %} - - {% block content %}{% endblock %} - {% if confidence %} -

- Prediction = {{ prediction }}
Confidence = {{ confidence }} -

- {% endif %} - {% with messages = get_flashed_messages() %} - {% if messages %} - - {% endif %} - {% endwith %} -
- - \ No newline at end of file diff --git a/nlp/app/templates/myform.html b/nlp/app/templates/myform.html deleted file mode 100644 index c39a643..0000000 --- a/nlp/app/templates/myform.html +++ /dev/null @@ -1,11 +0,0 @@ -{% extends "base.html" %} - -{% block content %} - Partisan headline classifier -
-

- {{ form.input_field.label }} {{ form.input_field(size=32) }} -

- {{ form.submit() }} -
-{% endblock %} \ No newline at end of file diff --git a/nlp/cli.py b/nlp/cli.py deleted file mode 100644 index 15d9be4..0000000 --- a/nlp/cli.py +++ /dev/null @@ -1,98 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Demonstrating a very simple NLP project. Yours should be more exciting than this.""" -import click -import glob -import pickle -import sys - -import numpy as np -import pandas as pd -import re -import requests -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import StratifiedKFold -from sklearn.metrics import accuracy_score, classification_report - -from . import clf_path, config - -@click.group() -def main(args=None): - """Console script for nlp.""" - return 0 - -@main.command('web') -@click.option('-p', '--port', required=False, default=5000, show_default=True, help='port of web server') -def web(port): - """ - Launch the flask web app. - """ - from .app import app - app.run(host='0.0.0.0', debug=True, port=port) - -@main.command('dl-data') -def dl_data(): - """ - Download training/testing data. - """ - data_url = config.get('data', 'url') - data_file = config.get('data', 'file') - print('downloading from %s to %s' % (data_url, data_file)) - r = requests.get(data_url) - with open(data_file, 'wt') as f: - f.write(r.text) - - -def data2df(): - return pd.read_csv(config.get('data', 'file')) - -@main.command('stats') -def stats(): - """ - Read the data files and print interesting statistics. - """ - df = data2df() - print('%d rows' % len(df)) - print('label counts:') - print(df.partisan.value_counts()) - -@main.command('train') -def train(): - """ - Train a classifier and save it. - """ - # (1) Read the data... - df = data2df() - # (2) Create classifier and vectorizer. - clf = LogisticRegression(max_iter=1000, C=1, class_weight='balanced') - vec = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english') - X = vec.fit_transform(df.title) - y = df.partisan.values - # (3) do cross-validation and print out validation metrics - # (classification_report) - do_cross_validation(clf, X, y) - # (4) Finally, train on ALL data one final time and - # train. Save the classifier to disk. - clf.fit(X, y) - pickle.dump((clf, vec), open(clf_path, 'wb')) - top_coef(clf, vec) - -def do_cross_validation(clf, X, y): - all_preds = np.zeros(len(y)) - for train, test in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X,y): - clf.fit(X[train], y[train]) - all_preds[test] = clf.predict(X[test]) - print(classification_report(y, all_preds)) - -def top_coef(clf, vec, labels=['liberal', 'conservative'], n=10): - feats = np.array(vec.get_feature_names_out()) - print('top coef for %s' % labels[1]) - for i in np.argsort(clf.coef_[0])[::-1][:n]: - print('%20s\t%.2f' % (feats[i], clf.coef_[0][i])) - print('\n\ntop coef for %s' % labels[0]) - for i in np.argsort(clf.coef_[0])[:n]: - print('%20s\t%.2f' % (feats[i], clf.coef_[0][i])) - -if __name__ == "__main__": - sys.exit(main()) diff --git a/nlp/nlp.py b/nlp/nlp.py deleted file mode 100644 index 7fbbae4..0000000 --- a/nlp/nlp.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Main module."""