diff --git a/AttentionLayer.py b/AttentionLayer.py
new file mode 100644
index 0000000..063abd4
--- /dev/null
+++ b/AttentionLayer.py
@@ -0,0 +1,122 @@
+import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
+from tensorflow.python.keras import backend as K
+
+
+class AttentionLayer(Layer):
+    """
+    Implements Bahdanau (additive) attention (https://arxiv.org/pdf/1409.0473.pdf).
+    Introduces three trainable weight matrices: W_a, U_a, and V_a.
+    """
+
+ def __init__(self, **kwargs):
+ super(AttentionLayer, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ assert isinstance(input_shape, list)
+ # Create a trainable weight variable for this layer.
+
+ self.W_a = self.add_weight(name='W_a',
+ shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
+ initializer='uniform',
+ trainable=True)
+ self.U_a = self.add_weight(name='U_a',
+ shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
+ initializer='uniform',
+ trainable=True)
+ self.V_a = self.add_weight(name='V_a',
+ shape=tf.TensorShape((input_shape[0][2], 1)),
+ initializer='uniform',
+ trainable=True)
+
+ super(AttentionLayer, self).build(input_shape) # Be sure to call this at the end
+
+ def call(self, inputs, verbose=False):
+        """
+        inputs: [encoder_output_sequence, decoder_output_sequence]
+        Returns context vectors (batch_size, de_seq_len, latent_dim) and attention
+        weights (batch_size, de_seq_len, en_seq_len).
+        """
+ assert type(inputs) == list
+ encoder_out_seq, decoder_out_seq = inputs
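+        # encoder_out_seq: (batch_size, en_seq_len, latent_dim)
+        # decoder_out_seq: (batch_size, de_seq_len, latent_dim)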
+ if verbose:
+ print('encoder_out_seq>', encoder_out_seq.shape)
+ print('decoder_out_seq>', decoder_out_seq.shape)
+
+ def energy_step(inputs, states):
+            """ Step function computing attention energies of one decoder state
+            against all encoder states: softmax(V_a^T tanh(S.W_a + h_j.U_a)) """
+
+ assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
+ assert isinstance(states, list) or isinstance(states, tuple), assert_msg
+
+ """ Some parameters required for shaping tensors"""
+ en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
+ de_hidden = inputs.shape[-1]
+
+ """ Computing S.Wa where S=[s0, s1, ..., si]"""
+ # <= batch_size*en_seq_len, latent_dim
+ reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
+            # <= batch_size, en_seq_len, latent_dim
+            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
+ if verbose:
+ print('wa.s>',W_a_dot_s.shape)
+
+ """ Computing hj.Ua """
+ U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1) # <= batch_size, 1, latent_dim
+ if verbose:
+ print('Ua.h>',U_a_dot_h.shape)
+
+ """ tanh(S.Wa + hj.Ua) """
+ # <= batch_size*en_seq_len, latent_dim
+ reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
+ if verbose:
+ print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)
+
+ """ softmax(va.tanh(S.Wa + hj.Ua)) """
+ # <= batch_size, en_seq_len
+ e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
+ # <= batch_size, en_seq_len
+ e_i = K.softmax(e_i)
+
+ if verbose:
+ print('ei>', e_i.shape)
+
+ return e_i, [e_i]
+
+ def context_step(inputs, states):
+ """ Step function for computing ci using ei """
+ # <= batch_size, hidden_size
+ c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
+ if verbose:
+ print('ci>', c_i.shape)
+ return c_i, [c_i]
+
+        def create_initial_state(inputs, hidden_size):
+            # We are not using initial states, but need to pass something to the K.rnn function
+            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
+            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size,)
+            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
+            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
+            return fake_state
+
+        fake_state_c = create_initial_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
+        fake_state_e = create_initial_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)
+
+ """ Computing energy outputs """
+ # e_outputs => (batch_size, de_seq_len, en_seq_len)
+ last_out, e_outputs, _ = K.rnn(
+ energy_step, decoder_out_seq, [fake_state_e],
+ )
+
+ """ Computing context vectors """
+ last_out, c_outputs, _ = K.rnn(
+ context_step, e_outputs, [fake_state_c],
+ )
+
+ return c_outputs, e_outputs
+
+ def compute_output_shape(self, input_shape):
+        """ Shapes of the two outputs: context vectors and attention weights """
+ return [
+ tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
+ tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
+ ]
\ No newline at end of file
diff --git a/build_model_keras.py b/build_model_keras.py
index b9efa8d..84768a9 100644
--- a/build_model_keras.py
+++ b/build_model_keras.py
@@ -1,37 +1,46 @@
-#building the model keras---an attempt to build a seq2seq model in keras
from data_clean import *
+from keras import backend as K
+K.clear_session()
+latent_dim = 300
+embedding_dim=100
-def define_model(max_text_length,max_summary_length,n_units):
- dim_rep=300
+# Encoder
+encoder_inputs = Input(shape=(max_text_len,))
+#embedding layer
+enc_emb = Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)
- encoder_inputs=Input(shape=(None,max_text_length,dim_rep))
+#encoder lstm 1
+encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
+encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
- encoder=LSTM(n_units,return_state=True)
- encoder_outputs, state_h, state_c = encoder(encoder_inputs)
- encoder_states = [state_h, state_c]
+#encoder lstm 2
+encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
+encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
- #define training decoder
- decoder_inputs = Input(shape=(None, max_summary_length,dim_rep))
+#encoder lstm 3
+encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
+encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
- decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
- decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
- decoder_dense = Dense(n_output, activation='softmax')
- decoder_outputs = decoder_dense(decoder_outputs)
- model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+# Set up the decoder, using `encoder_states` as initial state.
+decoder_inputs = Input(shape=(None,))
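+# shape (None,) lets the same decoder graph take full target sequences during
+# training (teacher forcing) and a single token per step during inference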
- #define inference encoder
- encoder_model = Model(encoder_inputs, encoder_states)
+#embedding layer
+dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
+dec_emb = dec_emb_layer(decoder_inputs)
- #define inference decoder
- decoder_state_input_h = Input(shape=(n_units,))
- decoder_state_input_c = Input(shape=(n_units,))
- decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
- decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
- decoder_states = [state_h, state_c]
- decoder_outputs = decoder_dense(decoder_outputs)
- decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
+decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
+# a plain LSTM returns its hidden and cell states (not forward/backward states)
+decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
- return model, encoder_model, decoder_model
+
+
+#dense layer
+decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
+decoder_outputs = decoder_dense(decoder_outputs)
+
+# Define the model
+model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+
+model.summary()
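+
+# One possible way to wire in the AttentionLayer defined in AttentionLayer.py
+# (left commented out because the inference script would need matching changes;
+# this follows the layer's [encoder_output_sequence, decoder_output_sequence] interface):
+# from AttentionLayer import AttentionLayer
+# attn_layer = AttentionLayer(name='attention_layer')
+# attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
+# decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
+# decoder_outputs = decoder_dense(decoder_concat_input)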
\ No newline at end of file
diff --git a/data_clean.py b/data_clean.py
index 1ff1a90..6a10ddc 100644
--- a/data_clean.py
+++ b/data_clean.py
@@ -1,244 +1,181 @@
from data_manip import *
-import nltk
-import tensorflow as tf
-nltk.download('stopwords')
-
-def clean_text(text, remove_stopwords = True):
-
- # Convert words to lower case
- text = text.lower()
- #[_\-;%()+&=*%.,!?:$@\[\]/]
- # Replace contractions with their longer forms
- if True:
- text = text.split()
- new_text = []
- for word in text:
- if word in contractions:
- new_text.append(contractions[word])
- else:
- new_text.append(word)
- text = " ".join(new_text)
-
- # Format words and remove unwanted characters
- text = re.sub(r'https?:\/\/.*[\r\n]*', '', text,
- flags=re.MULTILINE)
- text = re.sub(r'\', ' ', text)
- text = re.sub(r'\'', ' ', text)
-
- # Optionally, remove stop words
- if remove_stopwords:
- text = text.split()
- stops = set(stopwords.words("english"))
- text = [w for w in text if not w in stops]
- text = " ".join(text)
- return text
-
-clean_summaries = []
-for summary in reviews.Summary:
- clean_summaries.append(clean_text(summary, remove_stopwords=False))
-print("Summaries are complete.")
-
-clean_texts = []
-for text in reviews.Text:
- clean_texts.append(clean_text(text))
-print("Texts are complete.")
-
-def count_words(count_dict, text):
- '''Count the number of occurrences of each word in a set of text'''
- for sentence in text:
- for word in sentence.split():
- if word not in count_dict:
- count_dict[word] = 1
- else:
- count_dict[word] += 1
-word_counts = {}
-
-count_words(word_counts, clean_summaries)
-count_words(word_counts, clean_texts)
-
-print("Size of Vocabulary:", len(word_counts))
-
-embeddings_index = {}
-with open('/home/pbu/Downloads/numberbatch-en.txt', encoding='utf-8') as f:
- for line in f:
- values = line.split(' ')
- word = values[0]
- embedding = np.asarray(values[1:], dtype='float32')
- embeddings_index[word] = embedding
-print('Word embeddings:', len(embeddings_index))
-
-# Find the number of words that are missing from CN, and are used more than our threshold.
-missing_words = 0
-threshold = 20
-
-for word, count in word_counts.items():
- if count > threshold:
- if word not in embeddings_index:
- missing_words += 1
-
-missing_ratio = round(missing_words/len(word_counts),4)*100
-
-print("Number of words missing from CN:", missing_words)
-print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))
-
-# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe
-
-#dictionary to convert words to integers
-vocab_to_int = {}
-
-value = 0
-for word, count in word_counts.items():
- if count >= threshold or word in embeddings_index:
- vocab_to_int[word] = value
- value += 1
-
-# Special tokens that will be added to our vocab
-codes = ["","","",""]
-
-# Add codes to vocab
-for code in codes:
- vocab_to_int[code] = len(vocab_to_int)
-
-# Dictionary to convert integers to words
-int_to_vocab = {}
-for word, value in vocab_to_int.items():
- int_to_vocab[value] = word
-
-usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100
-
-print("Total number of unique words:", len(word_counts))
-print("Number of words we will use:", len(vocab_to_int))
-print("Percent of words we will use: {}%".format(usage_ratio))
-
-# Need to use 300 for embedding dimensions to match CN's vectors.
-embedding_dim = 300
-nb_words = len(vocab_to_int)
-
-# Create matrix with default values of zero
-word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
-for word, i in vocab_to_int.items():
- if word in embeddings_index:
- word_embedding_matrix[i] = embeddings_index[word]
+
+stop_words = set(stopwords.words('english'))
+
+def text_cleaner(text,num):
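+    # num==0: clean a review text (stopwords removed); num==1: clean a summary (stopwords kept)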
+ newString = text.lower()
+ newString = BeautifulSoup(newString, "lxml").text
+ newString = re.sub(r'\([^)]*\)', '', newString)
+ newString = re.sub('"','', newString)
+ newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
+ newString = re.sub(r"'s\b","",newString)
+ newString = re.sub("[^a-zA-Z]", " ", newString)
+ newString = re.sub('[m]{2,}', 'mm', newString)
+ if(num==0):
+ tokens = [w for w in newString.split() if not w in stop_words]
else:
- # If word not in CN, create a random embedding for it
- new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
- embeddings_index[word] = new_embedding
- word_embedding_matrix[i] = new_embedding
-
-# Check if value matches len(vocab_to_int)
-print(len(word_embedding_matrix))
-
-def convert_to_ints(text, word_count, unk_count, eos=False):
- '''Convert words in text to an integer.
- If word is not in vocab_to_int, use UNK's integer.
- Total the number of words and UNKs.
- Add EOS token to the end of texts'''
- ints = []
- for sentence in text:
- sentence_ints = []
- for word in sentence.split():
- word_count += 1
- if word in vocab_to_int:
- sentence_ints.append(vocab_to_int[word])
- else:
- sentence_ints.append(vocab_to_int[""])
- unk_count += 1
- if eos:
- sentence_ints.append(vocab_to_int[""])
- ints.append(sentence_ints)
- return ints, word_count, unk_count
-# Apply convert_to_ints to clean_summaries and clean_texts
-word_count = 0
-unk_count = 0
-
-int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
-int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)
-unk_percent = round(unk_count/word_count,4)*100
-
-print("Total number of words in headlines:", word_count)
-print("Total number of UNKs in headlines:", unk_count)
-print("Percent of words that are UNK: {}%".format(unk_percent))
-
-
-def create_lengths(text):
- '''Create a data frame of the sentence lengths from a text'''
- lengths = []
- for sentence in text:
- lengths.append(len(sentence))
-
- return pd.DataFrame(lengths, columns=['counts'])
-
-lengths_summaries = create_lengths(int_summaries)
-lengths_texts = create_lengths(int_texts)
-
-print("Summaries:")
-print(lengths_summaries.describe())
-print()
-print("Texts:")
-print(lengths_texts.describe())
-
-
-def unk_counter(sentence):
- '''Counts the number of time UNK appears in a sentence.'''
- unk_count = 0
- for word in sentence:
- if word == vocab_to_int[""]:
- unk_count += 1
- return unk_count
-
-# Sort the summaries and texts by the length of the texts, shortest to longest
-# Limit the length of summaries and texts based on the min and max ranges.
-# Remove reviews that include too many UNKs
-
-sorted_summaries = []
-sorted_texts = []
-max_text_length = 84
-max_summary_length = 13
-min_length = 2
-unk_text_limit = 1
-unk_summary_limit = 0
-
-for length in range(min(lengths_texts.counts), max_text_length):
- for count, words in enumerate(int_summaries):
- if (len(int_summaries[count]) >= min_length and
- len(int_summaries[count]) <= max_summary_length and
- len(int_texts[count]) >= min_length and
- unk_counter(int_summaries[count]) <= unk_summary_limit and
- unk_counter(int_texts[count]) <= unk_text_limit and
- length == len(int_texts[count])
- ):
- sorted_summaries.append(int_summaries[count])
- sorted_texts.append(int_texts[count])
-
-# Compare lengths to ensure they match
-print(len(sorted_summaries))
-print(len(sorted_texts))
-#print(np.array(sorted_texts).shape)
-
-def pad_sentence_batch(sentence,max_length):
- """Pad sentences with so that each sentence of a batch has the same length"""
-
- return [sentence + [vocab_to_int['']] * (max_length - len(sentence))]
-
-def process_encoding_input(target_data, vocab_to_int, batch_size):
- '''Remove the last word id from each batch and concat the to the begining of each batch'''
-
- ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
- dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['']), ending], 1)
-
- return dec_input
-
-padded_summaries=list()
-padded_text=list()
-
-for summaries in sorted_summaries:
- summaries=process_encoding_input(summaries,vocab_to_int,1)
- summaries=pad_sentence_batch(summaries,max_summary_length)
- padded_summaries.append(summaries)
-
-for text in sorted_texts:
- text=pad_sentence_batch(text,max_text_length)
- padded_text.append(text)
+ tokens=newString.split()
+ long_words=[]
+ for i in tokens:
+        if len(i)>1:  # drop one-letter words
+ long_words.append(i)
+ return (" ".join(long_words)).strip()
+
+cleaned_text = []
+for t in data['Text']:
+ cleaned_text.append(text_cleaner(t,0))
+
+cleaned_summary = []
+for t in data['Summary']:
+ cleaned_summary.append(text_cleaner(t,1))
+
+data['cleaned_text']=cleaned_text
+data['cleaned_summary']=cleaned_summary
+'''
+import matplotlib.pyplot as plt
+
+text_word_count = []
+summary_word_count = []
+
+# populate the lists with sentence lengths
+for i in data['cleaned_text']:
+ text_word_count.append(len(i.split()))
+
+for i in data['cleaned_summary']:
+ summary_word_count.append(len(i.split()))
+
+length_df = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})
+
+length_df.hist(bins = 30)
+plt.show()
+'''
+
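+# what fraction of summaries fit within 8 words? used to pick max_summary_len below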
+cnt=0
+for i in data['cleaned_summary']:
+ if(len(i.split())<=8):
+ cnt=cnt+1
+print(cnt/len(data['cleaned_summary']))
+
+max_text_len=30
+max_summary_len=8
+
+cleaned_text = np.array(data['cleaned_text'])
+cleaned_summary = np.array(data['cleaned_summary'])
+
+short_text = []
+short_summary = []
+
+for i in range(len(cleaned_text)):
+ if (len(cleaned_summary[i].split()) <= max_summary_len and len(cleaned_text[i].split()) <= max_text_len):
+ short_text.append(cleaned_text[i])
+ short_summary.append(cleaned_summary[i])
+
+df = pd.DataFrame({'text': short_text, 'summary': short_summary})
+
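+# mark the start and end of every summary so the decoder learns when to begin and stop generating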
+df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
+
+from sklearn.model_selection import train_test_split
+x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0,shuffle=True)
+
+
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+
+#prepare a tokenizer for reviews on training data
+x_tokenizer = Tokenizer()
+x_tokenizer.fit_on_texts(list(x_tr))
+
+thresh = 4
+
+cnt = 0
+tot_cnt = 0
+freq = 0
+tot_freq = 0
+
+for key, value in x_tokenizer.word_counts.items():
+ tot_cnt = tot_cnt + 1
+ tot_freq = tot_freq + value
+ if (value < thresh):
+ cnt = cnt + 1
+ freq = freq + value
+
+print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
+print("Total Coverage of rare words:", (freq / tot_freq) * 100)
+
+#refit the review tokenizer, keeping only the most frequent words (those above the rarity threshold)
+x_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
+x_tokenizer.fit_on_texts(list(x_tr))
+
+#convert text sequences into integer sequences
+x_tr_seq = x_tokenizer.texts_to_sequences(x_tr)
+x_val_seq = x_tokenizer.texts_to_sequences(x_val)
+
+#pad with zeros up to the maximum length
+x_tr = pad_sequences(x_tr_seq, maxlen=max_text_len, padding='post')
+x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')
+
+#size of vocabulary ( +1 for padding token)
+x_voc = x_tokenizer.num_words + 1
+
+print(x_voc)
+
+#prepare a tokenizer for summaries on training data
+y_tokenizer = Tokenizer()
+y_tokenizer.fit_on_texts(list(y_tr))
+
+thresh = 6
+
+cnt = 0
+tot_cnt = 0
+freq = 0
+tot_freq = 0
+
+for key, value in y_tokenizer.word_counts.items():
+ tot_cnt = tot_cnt + 1
+ tot_freq = tot_freq + value
+ if (value < thresh):
+ cnt = cnt + 1
+ freq = freq + value
+
+print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
+print("Total Coverage of rare words:", (freq / tot_freq) * 100)
+
+#refit the summary tokenizer, keeping only the most frequent words
+y_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
+y_tokenizer.fit_on_texts(list(y_tr))
+
+#convert text sequences into integer sequences
+y_tr_seq = y_tokenizer.texts_to_sequences(y_tr)
+y_val_seq = y_tokenizer.texts_to_sequences(y_val)
+
+#pad with zeros up to the maximum length
+y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
+y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')
+
+#size of vocabulary (+1 for the padding token)
+y_voc = y_tokenizer.num_words + 1
+print(y_tokenizer.word_counts['sostok'], len(y_tr))  # sanity check: one 'sostok' per training summary
+
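+# remove pairs whose summary contains only the start and end tokens (everything
+# else was a rare word dropped by the tokenizer); done for both training and validation splits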
+ind=[]
+for i in range(len(y_tr)):
+ cnt=0
+ for j in y_tr[i]:
+ if j!=0:
+ cnt=cnt+1
+ if(cnt==2):
+ ind.append(i)
+
+y_tr=np.delete(y_tr,ind, axis=0)
+x_tr=np.delete(x_tr,ind, axis=0)
+
+ind=[]
+for i in range(len(y_val)):
+ cnt=0
+ for j in y_val[i]:
+ if j!=0:
+ cnt=cnt+1
+ if(cnt==2):
+ ind.append(i)
+
+y_val=np.delete(y_val,ind, axis=0)
+x_val=np.delete(x_val,ind, axis=0)
\ No newline at end of file
diff --git a/data_manip.py b/data_manip.py
index 10cc1f3..aa8c0a8 100644
--- a/data_manip.py
+++ b/data_manip.py
@@ -1,101 +1,38 @@
import numpy as np
-import re
import pandas as pd
-import numpy as np
-#import tensorflow as tf
import re
+from bs4 import BeautifulSoup
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
-import time
-#from tensorflow.python.layers.core import Dense
-#from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
-reviews = pd.read_csv("/home/pbu/Downloads/amazon-fine-food-reviews/Reviews.csv")
-#print(reviews.shape)
-print(reviews.isnull().sum())
-# Remove null values and unneeded features
-reviews = reviews.dropna()
-reviews = reviews.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator',
- 'Score','Time'], 1)
-reviews = reviews.reset_index(drop=True)
+from keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+
+data=pd.read_csv("/home/pbu/Downloads/amazon-fine-food-reviews/Reviews.csv",nrows=100000)
+data.drop_duplicates(subset=['Text'],inplace=True)  # drop duplicate reviews
+data.dropna(axis=0,inplace=True)  # drop rows with missing values
-'''
-for i in range(5):
- print("Review #",i+1)
- print(reviews.Summary[i])
- print(reviews.Text[i])
- print('\n')
-'''
-contractions = {
-"ain't": "am not",
-"aren't": "are not",
-"can't": "cannot",
-"can't've": "cannot have",
-"'cause": "because",
-"could've": "could have",
-"couldn't": "could not",
-"couldn't've": "could not have",
-"didn't": "did not",
-"doesn't": "does not",
-"don't": "do not",
-"hadn't": "had not",
-"hadn't've": "had not have",
-"hasn't": "has not",
-"haven't": "have not",
-"he'd": "he would",
-"he'd've": "he would have",
-"he'll": "he will",
-"he's": "he is",
-"how'd": "how did",
-"how'll": "how will",
-"how's": "how is",
-"i'd": "i would",
-"i'll": "i will",
-"i'm": "i am",
-"i've": "i have",
-"isn't": "is not",
-"it'd": "it would",
-"it'll": "it will",
-"it's": "it is",
-"let's": "let us",
-"ma'am": "madam",
-"mayn't": "may not",
-"might've": "might have",
-"mightn't": "might not",
-"must've": "must have",
-"mustn't": "must not",
-"needn't": "need not",
-"oughtn't": "ought not",
-"shan't": "shall not",
-"sha'n't": "shall not",
-"she'd": "she would",
-"she'll": "she will",
-"she's": "she is",
-"should've": "should have",
-"shouldn't": "should not",
-"that'd": "that would",
-"that's": "that is",
-"there'd": "there had",
-"there's": "there is",
-"they'd": "they would",
-"they'll": "they will",
-"they're": "they are",
-"they've": "they have",
-"wasn't": "was not",
-"we'd": "we would",
-"we'll": "we will",
-"we're": "we are",
-"we've": "we have",
-"weren't": "were not",
-"what'll": "what will",
-"what're": "what are",
-"what's": "what is",
-"what've": "what have",
-"where'd": "where did",
-"where's": "where is",
-"who'll": "who will",
-"who's": "who is",
-"won't": "will not",
-"wouldn't": "would not",
-"you'd": "you would",
-"you'll": "you will",
-"you're": "you are"
-}
+contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
+ "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
+ "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
+ "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
+ "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
+ "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
+ "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
+ "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
+ "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
+ "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
+ "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
+ "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
+ "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
+ "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
+ "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
+ "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
+ "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
+ "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
+ "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
+ "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
+ "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
+ "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
+ "you're": "you are", "you've": "you have"}
\ No newline at end of file
diff --git a/inference_model_keras.py b/inference_model_keras.py
new file mode 100644
index 0000000..a6b88e2
--- /dev/null
+++ b/inference_model_keras.py
@@ -0,0 +1,85 @@
+from train_model_keras import *
+
+reverse_target_word_index=y_tokenizer.index_word
+reverse_source_word_index=x_tokenizer.index_word
+target_word_index=y_tokenizer.word_index
+
+# Encode the input sequence to get the feature vector
+encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
+
+# Decoder setup
+# Below tensors will hold the states of the previous time step
+decoder_state_input_h = Input(shape=(latent_dim,))
+decoder_state_input_c = Input(shape=(latent_dim,))
+decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))
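+# The encoder output sequence is accepted (and fed at predict time) so the
+# interface matches an attention-based decoder, but the plain LSTM decoder below never reads it.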
+
+# Get the embeddings of the decoder sequence
+dec_emb2= dec_emb_layer(decoder_inputs)
+# To predict the next word in the sequence, set the initial states to the states from the previous time step
+decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
+
+
+
+# A dense softmax layer to generate prob dist. over the target vocabulary
+decoder_outputs2 = decoder_dense(decoder_outputs2)
+
+# Final decoder model
+decoder_model = Model(
+ [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
+ [decoder_outputs2] + [state_h2, state_c2])
+
+
+def decode_sequence(input_seq):
+ # Encode the input as state vectors.
+ e_out, e_h, e_c = encoder_model.predict(input_seq)
+
+ # Generate empty target sequence of length 1.
+ target_seq = np.zeros((1, 1))
+
+ # Populate the first word of target sequence with the start word.
+ target_seq[0, 0] = target_word_index['sostok']
+
+ stop_condition = False
+ decoded_sentence = ''
+ while not stop_condition:
+
+ output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
+
+ # Sample a token
+ sampled_token_index = np.argmax(output_tokens[0, -1, :])
+ sampled_token = reverse_target_word_index[sampled_token_index]
+
+ if (sampled_token != 'eostok'):
+ decoded_sentence += ' ' + sampled_token
+
+ # Exit condition: either hit max length or find stop word.
+ if (sampled_token == 'eostok' or len(decoded_sentence.split()) >= (max_summary_len - 1)):
+ stop_condition = True
+
+ # Update the target sequence (of length 1).
+ target_seq = np.zeros((1, 1))
+ target_seq[0, 0] = sampled_token_index
+
+ # Update internal states
+ e_h, e_c = h, c
+
+ return decoded_sentence
+
+def seq2summary(input_seq):
+ newString=''
+ for i in input_seq:
+ if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']):
+ newString=newString+reverse_target_word_index[i]+' '
+ return newString
+
+def seq2text(input_seq):
+ newString=''
+ for i in input_seq:
+ if(i!=0):
+ newString=newString+reverse_source_word_index[i]+' '
+ return newString
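+
+# print a few sample predictions from the training set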
+for i in range(0,5):
+ print("Review:",seq2text(x_tr[i]))
+ print("Original summary:",seq2summary(y_tr[i]))
+ print("Predicted summary:",decode_sequence(x_tr[i].reshape(1,max_text_len)))
+ print("\n")
\ No newline at end of file
diff --git a/seq2seq_keras_sp.h5 b/seq2seq_keras_sp.h5
new file mode 100644
index 0000000..ce0af92
Binary files /dev/null and b/seq2seq_keras_sp.h5 differ
diff --git a/train_model_keras.py b/train_model_keras.py
index 93686da..f3d5d3b 100644
--- a/train_model_keras.py
+++ b/train_model_keras.py
@@ -1,17 +1,13 @@
-#training the model in keras
from build_model_keras import *
-start = 200000
-end = start + 50000
-sorted_summaries_short = sorted_summaries[start:end]
-sorted_texts_short = sorted_texts[start:end]
-print("The shortest text length:", len(sorted_texts_short[0]))
-print("The longest text length:",len(sorted_texts_short[-1]))
-embeddings=word_embedding_matrix
-enc_embed_input=tf.nn.embedding_lookup(embeddings,padded_text)
-dec_embed_input=tf.nn.embedding_lookup(embeddings,padded_summaries)
-n_units=100
-model,encoder_model,decoder_model=define_model(max_text_length,max_summary_length,n_units)
+model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
+checkpointer = ModelCheckpoint(filepath='seq2seq_keras_sp.h5', verbose=1, save_best_only=True)
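+# teacher forcing: the decoder is fed y[:, :-1] and trained to predict y[:, 1:],
+# i.e. the same summary shifted one step ahead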
+history = model.fit([x_tr, y_tr[:, :-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:], epochs=50,
+                    callbacks=[checkpointer], batch_size=128,
+                    validation_data=([x_val, y_val[:, :-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]))
-model.compile(optimizer='rmsprop',loss='categorical_crossentropy')
+from matplotlib import pyplot
+pyplot.plot(history.history['loss'], label='train')
+pyplot.plot(history.history['val_loss'], label='validation')
+pyplot.legend()
+pyplot.show()
\ No newline at end of file