# util.py

#!/usr/bin/env python3
import os
import sys
import copy
import re
import time
import datetime

from urllib.request import urlopen

import numpy as np

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import json
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F

# training with SGLD with annealing and save models
def train(X_train, y_train, X_valid, y_valid, X_test, y_test, model, args):
    """Train ``model`` with noisy (SGLD-style) gradient steps and periodic checkpoints.

    For every mini-batch each updated parameter is moved by
    ``-epsilon / 2 * grad + noise`` where
    ``epsilon = args.lr * epoch ** -0.333`` and ``noise`` is Gaussian with
    standard deviation ``sqrt(epsilon / args.t)``. ``args.t`` is incremented
    after every mini-batch (annealing), so the injected noise shrinks over time.
    Every fifth epoch the model is saved via ``save`` and evaluated on the
    validation and test splits via ``eval``.

    Args:
        X_train, y_train: training features / labels; sliced along the first
            axis and converted with ``torch.LongTensor``.
        X_valid, y_valid, X_test, y_test: splits passed to ``eval``.
        model: module called on a feature batch; its output is fed to
            ``F.cross_entropy``.
        args: namespace providing ``batch_size``, ``epochs``, ``lr``, ``t``,
            ``cuda``, ``static`` and ``save_dir``. ``args.t`` is mutated.
    """
    batch = args.batch_size
    n_samples = X_train.shape[0]

    for epoch in range(1, args.epochs + 1):
        # eval() below switches the model to evaluation mode, so training
        # mode has to be re-enabled at the start of every epoch.
        model.train()
        corrects, seen = 0, 0
        epsilon = args.lr * ((epoch * 1.0) ** (-0.333))  # optimal decay rate
        # Stepping by `batch` never produces an empty trailing mini-batch,
        # even when the dataset size is a multiple of the batch size.
        for idx, start in enumerate(range(0, n_samples, batch)):
            feature = torch.LongTensor(X_train[start:start + batch])
            target = torch.LongTensor(y_train[start:start + batch])
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            model.zero_grad()
            loss.backward()

            noise_std = float(np.sqrt(epsilon / args.t))
            with torch.no_grad():
                for layer_no, param in enumerate(model.parameters()):
                    if args.static and layer_no == 0:  # fixed embedding layer cannot update
                        continue
                    if param.grad is None:  # parameter took no part in this step
                        continue
                    # randn_like draws the noise on the parameter's own
                    # device, so the update works on CPU as well as GPU.
                    noise = torch.randn_like(param) * noise_std
                    param.add_(-epsilon / 2 * param.grad + noise)

            corrects += (torch.max(logit, 1)[1].view(target.size()) == target).sum().item()
            seen += target.size(0)  # real sample count; the last batch may be short
            accuracy = 100.0 * corrects / seen
            sys.stdout.write('\rEpoch[{}] Batch[{}] - loss: {:.4f}  acc: {:.2f}%({}/{}) tempreture: {}'.format(
                             epoch, idx, loss.item(), accuracy, corrects, seen, int(args.t)))
            args.t = args.t + 1  # annealing
        if epoch % 5 != 0:
            continue
        save(model, args.save_dir, epoch)
        print()
        eval(X_valid, y_valid, model, 'Validation', args)
        eval(X_test, y_test, model, 'Testing   ', args)


def eval(X, y, model, term, args):
    """Evaluate ``model`` on ``(X, y)``, print a summary line and return the accuracy.

    Besides overall loss and accuracy, the summary reports accuracy on the
    "confident" subsets: samples whose class-1 probability lies more than
    0.2 (resp. 0.4) away from 0.5. The probability and the TP/TN counts use
    only the first two logits, i.e. the function assumes binary labels.

    Args:
        X, y: features / labels; sliced along the first axis and converted
            with ``torch.LongTensor``.
        model: module called on a feature batch; switched to eval mode here
            and left in that mode.
        term: label printed at the start of the summary line.
        args: namespace providing ``batch_size`` and ``cuda``.

    Returns:
        Accuracy in percent (float).
    """
    model.eval()
    batch = args.batch_size
    thresholds = (0.2, 0.4)
    corrects, TP, avg_loss = 0, 0, 0.0
    correct_part = {thres: 0 for thres in thresholds}
    # The tiny start value keeps the percentage well-defined for empty subsets.
    total_part = {thres: 1e-16 for thres in thresholds}

    with torch.no_grad():  # evaluation needs no autograd graph
        # Stepping by `batch` avoids the empty trailing batch that occurs
        # when the dataset size is a multiple of the batch size.
        for start in range(0, X.shape[0], batch):
            feature = torch.LongTensor(X[start:start + batch])
            target = torch.LongTensor(y[start:start + batch])
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            logit = model(feature)
            avg_loss += F.cross_entropy(logit, target, reduction='sum').item()
            predicted = torch.max(logit, 1)[1]
            hit = predicted == target
            # Same value as exp(l1) / (exp(l0) + exp(l1)), but softmax does
            # not overflow to NaN for large logits.
            predictor = F.softmax(logit[:, :2], dim=1)[:, 1]
            for thres in thresholds:
                confident = (predictor > 0.5 + thres) | (predictor < 0.5 - thres)
                correct_part[thres] += (hit & confident).sum().item()
                total_part[thres] += confident.sum().item()

            corrects += hit.sum().item()
            TP += (hit & (predicted == 1)).sum().item()

    size = y.shape[0]
    avg_loss /= size
    accuracy = 100.0 * corrects / size
    # TP, TN: True Positive/True Negative
    print('         {} - loss: {:.4f} acc: {:.2f}%({}/{}) {:.2f}%({}/{}) {:.2f}%({}/{}) TP/TN: ({}/{}) \n'.format(term,
          avg_loss, accuracy, corrects, size, 100.0 * correct_part[0.2] / total_part[0.2], correct_part[0.2], int(total_part[0.2]),
          100.0 * correct_part[0.4] / total_part[0.4], correct_part[0.4], int(total_part[0.4]), TP, corrects - TP))
    return accuracy

def bma_eval(X, y, mymodels, term, args):
    """Evaluate every model in ``mymodels`` on ``(X, y)`` and report the averaged metrics.

    Each model is evaluated independently; loss, correct counts, the
    confident-subset counts (class-1 probability more than 0.2 / 0.4 away
    from 0.5) and the true-positive count are then averaged over the models.
    NOTE: this averages per-model metrics; it does not score the averaged
    prediction of the ensemble. Only the first two logits are used for the
    probability and TP/TN counts, i.e. binary labels are assumed.

    Args:
        X, y: features / labels; sliced along the first axis and converted
            with ``torch.LongTensor``.
        mymodels: non-empty sequence of modules; each is switched to eval mode.
        term: label printed at the start of the summary line.
        args: namespace providing ``batch_size`` and ``cuda``.

    Returns:
        Mean accuracy over the models, in percent (float).

    Raises:
        ValueError: if ``mymodels`` is empty.
    """
    n_models = len(mymodels)
    if n_models == 0:
        raise ValueError('bma_eval needs at least one model')

    batch = args.batch_size
    thresholds = (0.2, 0.4)
    corrects, TP, avg_loss = 0, 0, 0.0
    correct_part = {thres: 0 for thres in thresholds}
    total_part = {thres: 0 for thres in thresholds}

    with torch.no_grad():  # evaluation needs no autograd graph
        for model in mymodels:
            model.eval()
            # Stepping by `batch` avoids an empty trailing batch.
            for start in range(0, X.shape[0], batch):
                feature = torch.LongTensor(X[start:start + batch])
                target = torch.LongTensor(y[start:start + batch])
                if args.cuda:
                    feature, target = feature.cuda(), target.cuda()

                logit = model(feature)
                avg_loss += F.cross_entropy(logit, target, reduction='sum').item()
                predicted = torch.max(logit, 1)[1]
                hit = predicted == target
                # Overflow-safe equivalent of exp(l1) / (exp(l0) + exp(l1)).
                predictor = F.softmax(logit[:, :2], dim=1)[:, 1]
                for thres in thresholds:
                    confident = (predictor > 0.5 + thres) | (predictor < 0.5 - thres)
                    correct_part[thres] += (hit & confident).sum().item()
                    total_part[thres] += confident.sum().item()
                corrects += hit.sum().item()
                TP += (hit & (predicted == 1)).sum().item()

    # Turn the sums over all models into per-model averages.
    corrects = corrects / n_models
    TP = TP / n_models
    correct_part = {thres: count / n_models for thres, count in correct_part.items()}
    # The tiny offset keeps the percentage well-defined for empty subsets.
    total_part = {thres: 1e-16 + count / n_models for thres, count in total_part.items()}

    size = y.shape[0]
    avg_loss = avg_loss / n_models / size
    accuracy = 100.0 * corrects / size
    print('         {} - loss: {:.4f} acc: {:.2f}%({}/{}) {:.2f}%({}/{}) {:.2f}%({}/{}) TP/TN: ({}/{}) \n'.format(term,
            avg_loss, accuracy, corrects, size, 100.0 * correct_part[0.2] / total_part[0.2], correct_part[0.2], int(total_part[0.2]),
            100.0 * correct_part[0.4] / total_part[0.4], correct_part[0.4], int(total_part[0.4]), TP, corrects - TP))
    return accuracy

def predictor_preprocess(cnn, args):
    """Load saved model snapshots plus the vocabulary and stop-word list.

    Every file in ``args.save_dir`` (in sorted name order, at most 32 of
    them) is loaded as a state dict into ``cnn`` and a deep copy of the
    resulting model is collected.

    Args:
        cnn: model instance whose architecture matches the saved state dicts;
            it is overwritten by each load.
        args: namespace providing ``save_dir`` and ``cuda``.

    Returns:
        Tuple ``(mymodels, word2idx, stopWords)``: the list of loaded models,
        the dict read from ``./input/word2idx`` (JSON) and the set of
        stripped lines of ``./input/stopWords``.
    """
    max_models = 32  # cap the number of snapshots in case memory overloads
    # Without GPU support, remap tensors saved from CUDA onto the CPU.
    map_location = None if args.cuda else 'cpu'

    # load trained thinning samples (Bayesian CNN models) from input/models/
    mymodels = []
    # Sorting makes the selected snapshots deterministic (os.listdir order is arbitrary).
    for each_model in sorted(os.listdir(args.save_dir))[:max_models]:
        # os.path.join works whether or not save_dir ends with a separator.
        model_path = os.path.join(args.save_dir, each_model)
        print(model_path)
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        cnn.load_state_dict(torch.load(model_path, map_location=map_location))
        mymodels.append(copy.deepcopy(cnn))

    with open('./input/word2idx', 'r') as file:
        word2idx = json.load(file)

    with open('./input/stopWords') as file:
        stopWords = {word.strip() for word in file}
    return (mymodels, word2idx, stopWords)

def predict(sentence, mymodels, word2idx, stopWords, args):
    """Return a trading signal label for ``sentence`` from the averaged model output.

    The sentence is tokenized with ``tokenize_news`` and mapped to vocabulary
    indices (unseen tokens map to ``word2idx['UNKNOWN']``). Each model's
    class-1 probability is computed from its first two logits; the mean over
    all models is translated into a label by ``signals``.

    Args:
        sentence: raw text to classify.
        mymodels: sequence of models; each is switched to eval mode.
        word2idx: token -> index mapping containing the key ``'UNKNOWN'``.
        stopWords: collection of tokens dropped during tokenization.
        args: namespace providing ``cuda``.

    Returns:
        The label from ``signals``, or ``'Unknown'`` when the sentence has
        fewer than five tokens, consists only of unknown tokens, or no
        models were supplied.
    """
    unknown = word2idx['UNKNOWN']
    tokens = [word2idx.get(t, unknown) for t in tokenize_news(sentence, stopWords)]
    # tokens cannot be too short or unknown
    if len(tokens) < 5 or all(t == unknown for t in tokens):
        return 'Unknown'
    # Without models the mean below would be NaN, which signals() would
    # report as 'Strong Sell'.
    if not mymodels:
        return 'Unknown'

    feature = torch.LongTensor([tokens])
    if args.cuda:
        feature = feature.cuda()

    probabilities = []
    with torch.no_grad():  # inference only
        for model in mymodels:
            model.eval()
            logit = model(feature)
            # Overflow-safe equivalent of exp(l1) / (exp(l0) + exp(l1)).
            predictor = F.softmax(logit[:, :2], dim=1)[:, 1]
            probabilities.append(predictor.item())
    return signals(np.mean(probabilities))


def daily_predict(cnn, args):
    """Recompute the signal column of the news CSV for ``args.date`` in place.

    Reads ``./input/news/<YYYY>/news_<date>.csv``, where ``<YYYY>`` is the
    first four characters of ``args.date``. Rows are split on ``,`` and must
    have six fields (ticker, name, day, headline, body, newsType) or seven
    (the same plus an existing signal, which is discarded); other rows are
    dropped. The signal predicted from ``headline + ' ' + body`` is written
    as the seventh field to a ``_bak`` file, which then replaces the original.

    NOTE: rows are split naively on commas, so a headline or body that itself
    contains a comma changes the field count and the row is dropped.

    Args:
        cnn: model instance passed to ``predictor_preprocess``.
        args: namespace providing ``date`` plus whatever
            ``predictor_preprocess`` and ``predict`` read from it.
    """
    mymodels, word2idx, stopWords = predictor_preprocess(cnn, args)
    output = './input/news/' + args.date[:4] + '/news_' + args.date + '.csv'
    backup = output + '_bak'
    # Both handles are closed even if a prediction raises; the input is
    # opened first so a missing source file leaves no empty backup behind.
    with open(output) as f, open(backup, 'w') as fout:
        for line in f:
            fields = line.strip().split(',')
            if len(fields) not in (6, 7):
                continue
            # A seventh field, if present, is an old signal and is replaced.
            ticker, name, day, headline, body, newsType = fields[:6]

            #if newsType != 'topStory': # newsType: [topStory, normal]
            #    signal = 'Unknown'
            content = headline + ' ' + body
            signal = predict(content, mymodels, word2idx, stopWords, args)
            fout.write(','.join([ticker, name, day, headline, body, newsType, signal]) + '\n')
    print('change file name')
    print('mv ' + output + '_bak ' + output)
    # os.replace needs no shell, is portable, and raises on failure
    # instead of silently returning an exit status.
    os.replace(backup, output)


def save(model, save_dir, steps):
    """Store ``model.state_dict()`` as ``<save_dir>/model_<steps>.pt``.

    ``save_dir`` (including missing parents) is created when it does not
    exist yet.
    """
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_file = '{}/model_{}.pt'.format(save_dir, steps)
    torch.save(model.state_dict(), checkpoint_file)

def signals(digit):
    """Map a class-1 probability to a trading label.

    The value is compared (strictly greater) against the bounds
    ``0.5 + 0.4``, ``0.5 + 0.05``, ``0.5 - 0.05`` and ``0.5 - 0.4`` in that
    order, giving 'Strong Buy', 'Buy', 'Unknown' and 'Sell'; anything that
    exceeds none of them is 'Strong Sell'.
    """
    strong_signal = 0.4
    unknown_thres = 0.05
    # (exclusive lower bound, label), ordered from most to least bullish
    ladder = (
        (0.5 + strong_signal, 'Strong Buy'),
        (0.5 + unknown_thres, 'Buy'),
        (0.5 - unknown_thres, 'Unknown'),
        (0.5 - strong_signal, 'Sell'),
    )
    for lower_bound, label in ladder:
        if digit > lower_bound:
            return label
    return 'Strong Sell'

def padding(sentencesVec, keepNum):
    """Force every row of a 2-D array to ``keepNum`` columns and flatten the result.

    Rows that are too short are left-padded with ones (the padded result is
    therefore floating point); rows that are long enough keep only their
    last ``keepNum`` columns.
    """
    n_rows, n_cols = sentencesVec.shape[0], sentencesVec.shape[1]
    if n_cols >= keepNum:
        # Already wide enough: keep the trailing columns only.
        return sentencesVec[:, -keepNum:].flatten()
    filler = np.ones([n_rows, keepNum - n_cols])
    return np.concatenate((filler, sentencesVec), axis=1).flatten()


def dateGenerator(numdays):
    """Return the set of ``YYYYMMDD`` strings for today and the ``numdays - 1`` days before it.

    Example element: ``'20151231'``. The result is a set, so it carries no order.
    """
    today = datetime.datetime.today()
    return {
        (today - datetime.timedelta(days=back)).strftime("%Y%m%d")
        for back in range(numdays)
    }


def generate_past_n_days(numdays):
    """Return ``numdays`` dates as ``YYYYMMDD`` strings, newest first.

    The list starts with today and steps back one day per element,
    e.g. ``['20151231', '20151230']``.
    """
    today = datetime.datetime.today()
    return [
        (today - datetime.timedelta(days=back)).strftime("%Y%m%d")
        for back in range(numdays)
    ]

def unify_word(word):  # went -> go, apples -> apple, BIG -> big
    """Unify verb tense and noun number, then lowercase the word.

    The word is passed through ``WordNetLemmatizer.lemmatize`` once per
    part-of-speech tag ('a', 's', 'r', 'n', 'v'), each pass working on the
    output of the previous one. Lemmatization is best effort: if a pass
    raises, the word is carried over unchanged to the next pass.

    NOTE: lowercasing happens only after lemmatizing, so the lemmatizer
    sees the word in its original case.

    Args:
        word: token to normalize.

    Returns:
        The lowercased, lemmatized token.
    """
    # One lemmatizer instance serves all five passes.
    lemmatizer = WordNetLemmatizer()
    for pos in ('a', 's', 'r', 'n', 'v'):  # ADJ, ADJ_SAT, ADV, NOUN, VERB
        try:
            word = lemmatizer.lemmatize(word, pos=pos)
        except Exception:
            # Best effort, but unlike a bare `except:` this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
    return word.lower()

def digit_filter(word):
    """Return ``""`` for words that begin with a digit or a dot, else the word itself.

    The pattern is matched at the start of the word only, so a token such
    as ``"3rd"`` or ``"."`` is dropped while ``"a1"`` is kept.
    """
    numeric_prefix = re.match(r'\d*\.?\d*', word).group()
    return "" if numeric_prefix != "" else word

def unify_word_meaning(word):
    """Collapse a few synonymous tokens onto one canonical token.

    "*-than-expected" phrases with a positive sense become "better", those
    with a negative sense become "lower", and the negations "no", "not",
    "n't" become "not". Any other word is returned unchanged.
    """
    synonym_groups = (
        ("better", ("bigger-than-expected", "higher-than-expected",
                    "better-than-expected", "stronger-than-expected")),
        ("lower", ("smaller-than-expected", "lower-than-expected",
                   "weaker-than-expected", "worse-than-expected")),
        ("not", ("no", "not", "n't")),
    )
    for canonical, variants in synonym_groups:
        if word in variants:
            return canonical
    return word

def get_soup_with_repeat(url, repeat_times=3, verbose=True):
    """Download ``url`` and parse it with BeautifulSoup, retrying on failure.

    Every attempt first sleeps for a Poisson(3)-distributed number of
    seconds, then fetches the page, decodes the body as UTF-8 and parses it
    with the ``lxml`` parser.

    Args:
        url: address passed to ``urlopen``.
        repeat_times: maximum number of attempts.
        verbose: when true, print ``'retry...'`` after each failed attempt.

    Returns:
        The parsed document from the first attempt that succeeds. When every
        attempt raises, the loop falls through and ``None`` is returned
        implicitly.

    NOTE(review): ``BeautifulSoup`` is not among the imports visible at the
    top of this file. Unless the name is provided elsewhere, the parse line
    raises ``NameError``, which the broad ``except Exception`` below swallows,
    so every attempt would fail and the function would return ``None`` --
    confirm and add the missing import.
    """
    for i in range(repeat_times): # repeat in case of http failure
        try:
            # random pause before each request, including the first one
            time.sleep(np.random.poisson(3))
            response = urlopen(url)
            data = response.read().decode('utf-8')
            return BeautifulSoup(data, "lxml")
        except Exception as e:
            # the error itself is printed for the first failed attempt only
            if i == 0:
                print(e)
            if verbose:
                print('retry...')
            continue

def tokenize_news(headline, stopWords):
    """Tokenize ``headline`` and normalize each token.

    Each token from ``nltk.word_tokenize`` goes through ``unify_word``
    (twice), ``digit_filter`` and ``unify_word_meaning``; tokens that end up
    in ``stopWords`` or empty are dropped.

    Returns:
        List of the surviving normalized tokens, in their original order.
    """
    kept = []
    for raw_token in nltk.word_tokenize(headline):  # + nltk.word_tokenize(body)
        # unify_word runs twice: some words fail filtering in the 1st time
        normalized = unify_word(unify_word(raw_token))
        normalized = unify_word_meaning(digit_filter(normalized))
        if normalized not in stopWords and normalized != "":
            kept.append(normalized)
    return kept


def value2int(y, clusters=2):
    """Bucket the values of ``y`` into ``clusters`` percentile-based classes.

    The cut points are the ``100 * i / clusters`` percentiles of ``y`` for
    ``i = 1 .. clusters - 1``. A value gets class ``i`` when it is greater
    than or equal to exactly ``i`` cut points, so values below the first cut
    are class 0 and a value sitting exactly on a cut point belongs to the
    upper class (previously such values kept their raw value instead of
    receiving a class index).

    Args:
        y: array-like of numeric values; it is not modified.
        clusters: number of classes.

    Returns:
        Array of class indices with the same shape and dtype as ``y``.
    """
    y = np.asarray(y)
    cut_points = [np.percentile(y, 100 * i / clusters) for i in range(1, clusters)]
    # side='right' counts the cut points that are <= each value.
    label = np.searchsorted(cut_points, y, side='right')
    return label.astype(y.dtype)

def value2int_simple(y):
    """Label each value of ``y`` by sign: 0 for negative, 1 for zero or positive.

    Works on a copy, so ``y`` itself is left untouched; the result keeps the
    dtype of ``y``. Entries that satisfy neither comparison keep their value.
    """
    is_negative = y < 0
    is_non_negative = y >= 0
    label = np.copy(y)
    label[is_non_negative] = 1
    label[is_negative] = 0
    return label


def model_eval(net, data_loader, if_print=1):
    """Compute the classification accuracy of ``net`` over ``data_loader``.

    Args:
        net: module returning per-class scores of shape (batch, classes);
            switched to eval mode here.
        data_loader: iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (its length is the accuracy denominator).
        if_print: when truthy (the default), print the accuracy line.

    Returns:
        Accuracy in percent (float).
    """
    net.eval()
    # Run the batches on whatever device the network lives on; a network
    # without parameters falls back to the CPU.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    with torch.no_grad():  # inference only; tensors need no Variable wrapper
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            prediction = outputs.max(1)[1]
            correct += prediction.eq(labels).sum().item()

    accuracy = 100.0 * correct / len(data_loader.dataset)
    if if_print:
        print('\nTest set: Accuracy: {:0.2f}%'.format(accuracy))
    return accuracy