rnnquestionsimilarity.py

# -*- coding: utf-8 -*-
"""RNNQuestionSimilarity

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14BiywiXQ4xZXr749o93DJk-gPhna2wny

## Required files for running the below code.

api_client.py

quora_questions.pkl (if not using the API client to fetch data)

glove.6B.zip (obtained after downloading the glove file below)

# Installing pydot and graphviz for Model plot.
Run the below commands and then restart runtime for effects to take place.
"""

# NOTE: lines starting with "!" are IPython/Colab shell escapes, not Python;
# they only work when this file is executed as notebook cells.
!pip install -q pydot
!pip install graphviz

"""# Downloading Glove embeddings"""

# Fetches the GloVe archive into the current working directory.
!wget http://nlp.stanford.edu/data/glove.6B.zip

"""# nltk downloads for text preprocessing"""

import nltk
# Fetch the NLTK resources, one after the other.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

"""# Necessary imports"""

import tensorflow as tf

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
import pydot, graphviz
from keras.utils import plot_model

import pickle

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from nltk.corpus import stopwords

import datetime
from time import time

import zipfile
import os
import sys
import io

from itertools import islice
import itertools

from sklearn.model_selection import train_test_split
import numpy as np

"""# Global Variables"""

# Width of every word vector. It is used both for the embedding matrix and
# for the Embedding layer, and must match the GloVe file read below
# ('glove.6B.50d.txt').
EMBEDDING_DIM = 50

"""# ZIP extract for Glove embeddings downloaded"""

# The download cell above saves the archive as "glove.6B.zip"; the ".1"
# suffix only exists when wget was re-run while an earlier copy was present.
# Keep using the ".1" copy when it is there (previous behaviour), otherwise
# fall back to the canonical name so a fresh run does not fail.
glove_archive = "glove.6B.zip.1"
if not os.path.exists(glove_archive):
    glove_archive = "glove.6B.zip"

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(glove_archive, 'r') as zip_ref:
    zip_ref.extractall(".")

"""# Code to stream data from the API client.
The code below has been converted to text because the API has been shut down by the admin.

import api_client
def getData(size):
  client = api_client.ApiClient("eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1NTQ2MjY2NDgsImlhdCI6MTUzOTA3NDY0OCwibmJmIjoxNTM5MDc0NjQ4LCJpZGVudGl0eSI6OH0.zJcMa4ZquR6AeXyoLlqaB8H-8VCWGwulaWdv7qHIn_o")
  data = client.get_kaggle_quora_data(size)
  return data
  
  
dat = getData(10000)
dat1 = getData(10000)
dat2 = getData(10000)
dat3 = getData(10000)
dat = dat + dat1 + dat2 + dat3
print(len(dat))
print(type(dat))

# Saved dataset to be loaded from pickle file.
"""

# Load the cached question records that are later handed to preprocess().
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'quora_questions.pkl' from a source you trust.
with open(r"quora_questions.pkl", "rb") as input_file:
  pickl_data = pickle.load(input_file)

"""# Functions for Preprocessing and Embedding indices creation."""

def get_maxlen(sequences):
  """Return the length of the longest item in `sequences`.

  Args:
    sequences: iterable of sized objects (strings, lists, ...).

  Returns:
    The largest `len(item)` found, or 0 when `sequences` is empty
    (previously an empty input raised ValueError).
  """
  return max((len(item) for item in sequences), default=0)

# Map every GloVe token to its float32 vector. Each line of the file is a
# whitespace-separated record: the token followed by its coefficients.
embeddings_index = {}
with io.open('glove.6B.50d.txt', encoding='utf8') as glove_file:
    for record in glove_file:
        fields = record.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

def create_embed_matrix(word_index, embeddings_index, embedding_dim=None):
  """Build the embedding weight matrix for a vocabulary.

  Args:
    word_index: dict mapping each word to its integer row index.
    embeddings_index: dict mapping words to their pre-trained vectors.
    embedding_dim: width of each vector; defaults to the module-level
      EMBEDDING_DIM when omitted.

  Returns:
    A 2-D numpy array in which row `i` holds the vector of the word whose
    index is `i`. Rows of words missing from `embeddings_index` (and any
    unused row, such as row 0) stay all-zeros.
  """
  if embedding_dim is None:
    embedding_dim = EMBEDDING_DIM

  # Size the matrix so that every index fits. For a contiguous 1..N index
  # this is the historical len(word_index) + 1; for a sparse index it grows
  # to the highest index instead of raising IndexError.
  highest_index = max(word_index.values(), default=0)
  n_rows = max(len(word_index), highest_index) + 1

  embedding_matrix = np.zeros((n_rows, embedding_dim))
  for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

  return embedding_matrix

def preprocess(dat, voc_size):
  """Turn raw question pairs into padded index sequences and a vocabulary.

  Args:
    dat: iterable of records, each indexable by 'question1', 'question2'
      and 'is_duplicate'.
    voc_size: maximum number of (most frequent) words kept in the vocabulary.

  Returns:
    A tuple `(padding, q1_train, q1_test, q2_train, q2_test, label_train,
    label_test, final_vocab)` where `padding` is the common sequence length,
    the `q*` entries are padded index arrays, the `label_*` entries are lists
    of ints and `final_vocab` maps each kept word to its tokenizer index.
  """
  # A set gives O(1) membership tests; the stop-word list is all lower-case,
  # so words are lower-cased before the comparison ("What" is a stop word).
  stop_words = set(stopwords.words('english'))

  def strip_stop_words(text):
    return ' '.join(w for w in text.split() if w.lower() not in stop_words)

  question1 = []
  question2 = []
  labels = []
  for record in dat:
    question1.append(strip_stop_words(record['question1']))
    question2.append(strip_stop_words(record['question2']))
    labels.append(int(record['is_duplicate']))

  # num_words = voc_size + 1 makes texts_to_sequences emit only the indices
  # 1..voc_size, i.e. exactly the words kept in final_vocab. Without it the
  # sequences contain indices that have no row in the embedding matrix.
  tok = Tokenizer(lower=True, num_words=voc_size + 1)
  tok.fit_on_texts(question1 + question2)

  # Tokenizer indices start at 1 and are ordered by frequency, so keeping
  # index <= voc_size keeps the most frequent words. This also copes with a
  # corpus that has fewer than voc_size distinct words.
  final_vocab = {
      word: index for word, index in tok.word_index.items()
      if index <= voc_size
  }

  q1_ids = tok.texts_to_sequences(question1)
  q2_ids = tok.texts_to_sequences(question2)

  # Pad to the longest question measured in tokens (the previous code used
  # the character count of the raw strings, which is far longer).
  padding = get_maxlen(q1_ids + q2_ids)
  q1_seq = pad_sequences(q1_ids, maxlen=padding)
  q2_seq = pad_sequences(q2_ids, maxlen=padding)

  q1_train, q1_test, q2_train, q2_test, label_train, label_test = train_test_split(
      q1_seq, q2_seq, labels, test_size=0.15, random_state=42)

  return padding, q1_train, q1_test, q2_train, q2_test, label_train, label_test, final_vocab

"""# Creation of Train and Test datasets along with Embedding matrix"""

# Split the pickled records into train/test arrays and collect the padded
# sequence length plus the word -> index vocabulary (10000 words at most).
(MAX_SEQ_LENGTH,
 q1_train, q1_test,
 q2_train, q2_test,
 label_train, label_test,
 word_to_index_map) = preprocess(pickl_data, 10000)

# One row of pre-trained GloVe weights per vocabulary index.
embedding_matrix = create_embed_matrix(word_to_index_map, embeddings_index)

# debug code
# print(embeddings_index["what"])

# redundant code
def word_to_index(vocab):
  """Map each word in `vocab` to its zero-based position.

  When a word occurs more than once, its last position wins.
  """
  positions = {}
  for position, word in enumerate(vocab):
    positions[word] = position
  return positions

"""# **LSTM Model with Manhattan Distance  **"""

# Model variables
# Argument passed to the shared LSTM layer.
n_hidden = 50
# Passed to the Adadelta optimizer as `clipnorm`.
gradient_clipping_norm = 1.25
# Mini-batch size handed to `fit`.
batch_size = 1024
# Number of training epochs handed to `fit`.
n_epoch = 20
def exponent_neg_manhattan_distance(left, right):
    """Similarity estimate of the two LSTM outputs: exp(-L1 distance).

    The absolute differences are summed along axis 1 (dimension kept), so
    identical inputs yield 1 and the value decays as they move apart.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# The visible layer: one int32 input of MAX_SEQ_LENGTH word indices per
# question of the pair.
left_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
right_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')

# Embedding initialised from the pre-built matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated by training.
embedding_layer = Embedding(len(embedding_matrix), EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQ_LENGTH, trainable=False)

# Embedded version of the inputs (the same layer instance serves both sides)
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model: the Lambda layer
# receives [left_output, right_output] and declares an output shape of
# (batch, 1).
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model: two inputs, one similarity output
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

# The similarity output is regressed against the is_duplicate labels with
# mean squared error.
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Print the layer-by-layer overview of the model.
malstm.summary()

"""# Start Training with Validation."""

# Start training
training_start_time = time()

# `epochs` is the Keras 2 keyword (the Keras 1 spelling `nb_epoch` is
# deprecated and rejected by newer releases). The labels come back from
# preprocess() as plain Python lists, so convert them to arrays before
# handing them to Keras.
malstm_trained = malstm.fit([q1_train, q2_train], np.asarray(label_train),
                            batch_size=batch_size, epochs=n_epoch,
                            validation_data=([q1_test, q2_test], np.asarray(label_test)))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

# saving the input data as a pickle file, do not run if api client is not available
# with open('quora_questions.pkl', 'wb') as f:
#  pickle.dump(dat,f)

"""# Saving the model architecture as a .PNG file"""

# Write a diagram of the model to 'modelLSTM.png'.
plot_model(malstm, to_file='modelLSTM.png')

"""# Plotting the model on Jupyter notebook."""

# Build an SVG rendering of the model graph. The result is a bare expression:
# presumably a notebook shows it as the cell output, while a plain script run
# simply discards it.
SVG(model_to_dot(malstm).create(prog='dot', format='svg'))