diff --git a/AttentionLayer.py b/AttentionLayer.py
new file mode 100644
index 0000000..063abd4
--- /dev/null
+++ b/AttentionLayer.py
@@ -0,0 +1,122 @@
+import tensorflow as tf
+import os
+from tensorflow.python.keras.layers import Layer
+from tensorflow.python.keras import backend as K
+
+
+class AttentionLayer(Layer):
+    """
+    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
+    It introduces three sets of weights: W_a, U_a, and V_a.
+    """
+
+    def __init__(self, **kwargs):
+        super(AttentionLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert isinstance(input_shape, list)
+        # Create trainable weight variables for this layer.
+
+        self.W_a = self.add_weight(name='W_a',
+                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.U_a = self.add_weight(name='U_a',
+                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.V_a = self.add_weight(name='V_a',
+                                   shape=tf.TensorShape((input_shape[0][2], 1)),
+                                   initializer='uniform',
+                                   trainable=True)
+
+        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end
+
+    def call(self, inputs, verbose=False):
+        """
+        inputs: [encoder_output_sequence, decoder_output_sequence]
+        """
+        assert type(inputs) == list
+        encoder_out_seq, decoder_out_seq = inputs
+        if verbose:
+            print('encoder_out_seq>', encoder_out_seq.shape)
+            print('decoder_out_seq>', decoder_out_seq.shape)
+
+        def energy_step(inputs, states):
+            """ Step function for computing energy for a single decoder state """
+
+            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
+            assert isinstance(states, list) or isinstance(states, tuple), assert_msg
+
+            """ Some parameters required for shaping tensors """
+            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
+            de_hidden = inputs.shape[-1]
+
+            """ Computing S.Wa where S=[s0, s1, ..., si] """
+            # <= (batch_size*en_seq_len, latent_dim)
+            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
+            # <= (batch_size, en_seq_len, latent_dim)
+            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
+            if verbose:
+                print('wa.s>', W_a_dot_s.shape)
+
+            """ Computing hj.Ua """
+            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= (batch_size, 1, latent_dim)
+            if verbose:
+                print('Ua.h>', U_a_dot_h.shape)
+
+            """ tanh(S.Wa + hj.Ua) """
+            # <= (batch_size*en_seq_len, latent_dim)
+            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
+            if verbose:
+                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)
+
+            """ softmax(va.tanh(S.Wa + hj.Ua)) """
+            # <= (batch_size, en_seq_len)
+            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
+            # <= (batch_size, en_seq_len)
+            e_i = K.softmax(e_i)
+
+            if verbose:
+                print('ei>', e_i.shape)
+
+            return e_i, [e_i]
+
+        def context_step(inputs, states):
+            """ Step function for computing ci using ei """
+            # <= (batch_size, hidden_size)
+            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
+            if verbose:
+                print('ci>', c_i.shape)
+            return c_i, [c_i]
+
+        def create_initial_state(inputs, hidden_size):
+            # We are not using initial states, but need to pass something to the K.rnn function
+            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
+            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size,)
+            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
+            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim)
+            return fake_state
+
+        fake_state_c = create_initial_state(encoder_out_seq, encoder_out_seq.shape[-1])
+        fake_state_e = create_initial_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)
+
+        """ Computing energy outputs """
+        # e_outputs => (batch_size, de_seq_len, en_seq_len)
+        last_out, e_outputs, _ = K.rnn(
+            energy_step, decoder_out_seq, [fake_state_e],
+        )
+
+        """ Computing context vectors """
+        last_out, c_outputs, _ = K.rnn(
+            context_step, e_outputs, [fake_state_c],
+        )
+
+        return c_outputs, e_outputs
+
+    def compute_output_shape(self, input_shape):
+        """ Outputs produced by the layer """
+        return [
+            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
+            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
+        ]
\ No newline at end of file
diff --git a/__pycache__/AttentionLayer.cpython-37.pyc b/__pycache__/AttentionLayer.cpython-37.pyc
new file mode 100644
index 0000000..e300498
Binary files /dev/null and b/__pycache__/AttentionLayer.cpython-37.pyc differ
diff --git a/__pycache__/build_model_keras.cpython-37.pyc b/__pycache__/build_model_keras.cpython-37.pyc
new file mode 100644
index 0000000..6113130
Binary files /dev/null and b/__pycache__/build_model_keras.cpython-37.pyc differ
diff --git a/__pycache__/data_clean.cpython-37.pyc b/__pycache__/data_clean.cpython-37.pyc
new file mode 100644
index 0000000..993aaee
Binary files /dev/null and b/__pycache__/data_clean.cpython-37.pyc differ
diff --git a/__pycache__/data_manip.cpython-37.pyc b/__pycache__/data_manip.cpython-37.pyc
index 2fc3eac..79ff932 100644
Binary files a/__pycache__/data_manip.cpython-37.pyc and b/__pycache__/data_manip.cpython-37.pyc differ
diff --git a/__pycache__/train_model_keras.cpython-37.pyc b/__pycache__/train_model_keras.cpython-37.pyc
new file mode 100644
index 0000000..0309888
Binary files /dev/null and b/__pycache__/train_model_keras.cpython-37.pyc differ
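Review note: the layer above returns two tensors per call — a context vector for every decoder step and the attention (energy) distribution over encoder steps. A minimal, self-contained usage sketch; all shapes and variable names here are illustrative and not part of the patch:

    from keras.layers import Input, LSTM, Concatenate
    from AttentionLayer import AttentionLayer

    # Hypothetical dimensions: 30 encoder steps, 8 decoder steps, latent size 300.
    enc_in = Input(shape=(30, 100))
    dec_in = Input(shape=(8, 100))
    enc_out, enc_h, enc_c = LSTM(300, return_sequences=True, return_state=True)(enc_in)
    dec_out, _, _ = LSTM(300, return_sequences=True, return_state=True)(dec_in, initial_state=[enc_h, enc_c])

    attn_out, attn_weights = AttentionLayer(name='attention_layer')([enc_out, dec_out])
    # attn_out:     (batch, 8, 300) -- one context vector per decoder step
    # attn_weights: (batch, 8, 30)  -- soft alignment over the 30 encoder steps
    dec_with_context = Concatenate(axis=-1)([dec_out, attn_out])

Note the layer builds on tensorflow.python.keras internally, so it should be fed tensors from the same Keras backend the rest of the project uses; mixing standalone keras and tf.keras tensors is a known source of breakage.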
diff --git a/build_model_keras.py b/build_model_keras.py
index b9efa8d..84768a9 100644
--- a/build_model_keras.py
+++ b/build_model_keras.py
@@ -1,37 +1,46 @@
-#building the model keras---an attempt to build a seq2seq model in keras
 from data_clean import *
+from keras import backend as K
+K.clear_session()
+latent_dim = 300
+embedding_dim = 100

-def define_model(max_text_length,max_summary_length,n_units):
-    dim_rep=300
+# Encoder
+encoder_inputs = Input(shape=(max_text_len,))
+# embedding layer
+enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)

-    encoder_inputs=Input(shape=(None,max_text_length,dim_rep))
+# encoder lstm 1
+encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
+encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

-    encoder=LSTM(n_units,return_state=True)
-    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
-    encoder_states = [state_h, state_c]
+# encoder lstm 2
+encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
+encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

-    #define training decoder
-    decoder_inputs = Input(shape=(None, max_summary_length,dim_rep))
+# encoder lstm 3
+encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
+encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

-    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
-    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
-    decoder_dense = Dense(n_output, activation='softmax')
-    decoder_outputs = decoder_dense(decoder_outputs)
-    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+# Set up the decoder, using the final encoder states as initial state.
+decoder_inputs = Input(shape=(None,))

-    #define inference encoder
-    encoder_model = Model(encoder_inputs, encoder_states)
+# embedding layer
+dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
+dec_emb = dec_emb_layer(decoder_inputs)

-    #define inference decoder
-    decoder_state_input_h = Input(shape=(n_units,))
-    decoder_state_input_c = Input(shape=(n_units,))
-    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
-    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
-    decoder_states = [state_h, state_c]
-    decoder_outputs = decoder_dense(decoder_outputs)
-    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
+decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
+# an LSTM returns its hidden and cell states, so name them accordingly
+decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

-    return model, encoder_model, decoder_model
+
+# dense layer
+decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
+decoder_outputs = decoder_dense(decoder_outputs)
+
+# Define the model
+model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+
+model.summary()
\ No newline at end of file
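Also worth flagging: nothing in build_model_keras.py imports or applies the AttentionLayer this patch adds, so the network above is a plain three-layer-encoder seq2seq. If attention is intended, a sketch of the usual wiring — using this file's own names, but untested against the rest of the patch — would replace the dense block:

    from AttentionLayer import AttentionLayer

    # Attend over the encoder output sequence at every decoder step.
    attn_out, attn_states = AttentionLayer(name='attention_layer')([encoder_outputs, decoder_outputs])

    # Concatenate each decoder output with its context vector before the softmax.
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
    decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
    decoder_outputs = decoder_dense(decoder_concat_input)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

The inference script later in this patch already reserves a decoder-model input for the encoder output sequence, which suggests this wiring was the plan.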
diff --git a/data_clean.py b/data_clean.py
index 1ff1a90..6a10ddc 100644
--- a/data_clean.py
+++ b/data_clean.py
@@ -1,244 +1,181 @@
 from data_manip import *
-import nltk
-import tensorflow as tf
-nltk.download('stopwords')
-
-def clean_text(text, remove_stopwords = True):
-
-    # Convert words to lower case
-    text = text.lower()
-    #[_\-;%()+&=*%.,!?:$@\[\]/]
-    # Replace contractions with their longer forms
-    if True:
-        text = text.split()
-        new_text = []
-        for word in text:
-            if word in contractions:
-                new_text.append(contractions[word])
-            else:
-                new_text.append(word)
-        text = " ".join(new_text)
-
-    # Format words and remove unwanted characters
-    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
-    text = re.sub(r'\<a href', ' ', text)
-    text = re.sub(r'\'', ' ', text)
-
-    # Optionally, remove stop words
-    if remove_stopwords:
-        text = text.split()
-        stops = set(stopwords.words("english"))
-        text = [w for w in text if not w in stops]
-        text = " ".join(text)
-    return text
-
-clean_summaries = []
-for summary in reviews.Summary:
-    clean_summaries.append(clean_text(summary, remove_stopwords=False))
-print("Summaries are complete.")
-
-clean_texts = []
-for text in reviews.Text:
-    clean_texts.append(clean_text(text))
-print("Texts are complete.")
-
-def count_words(count_dict, text):
-    '''Count the number of occurrences of each word in a set of text'''
-    for sentence in text:
-        for word in sentence.split():
-            if word not in count_dict:
-                count_dict[word] = 1
-            else:
-                count_dict[word] += 1
-word_counts = {}
-
-count_words(word_counts, clean_summaries)
-count_words(word_counts, clean_texts)
-
-print("Size of Vocabulary:", len(word_counts))
-
-embeddings_index = {}
-with open('/home/pbu/Downloads/numberbatch-en.txt', encoding='utf-8') as f:
-    for line in f:
-        values = line.split(' ')
-        word = values[0]
-        embedding = np.asarray(values[1:], dtype='float32')
-        embeddings_index[word] = embedding
-print('Word embeddings:', len(embeddings_index))
-
-# Find the number of words that are missing from CN, and are used more than our threshold.
-missing_words = 0
-threshold = 20
-
-for word, count in word_counts.items():
-    if count > threshold:
-        if word not in embeddings_index:
-            missing_words += 1
-
-missing_ratio = round(missing_words/len(word_counts),4)*100
-
-print("Number of words missing from CN:", missing_words)
-print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))
-
-# Limit the vocab that we will use to words that appear ≥ threshold or are in CN
-
-#dictionary to convert words to integers
-vocab_to_int = {}
-
-value = 0
-for word, count in word_counts.items():
-    if count >= threshold or word in embeddings_index:
-        vocab_to_int[word] = value
-        value += 1
-
-# Special tokens that will be added to our vocab
-codes = ["<UNK>","<PAD>","<EOS>","<GO>"]
-
-# Add codes to vocab
-for code in codes:
-    vocab_to_int[code] = len(vocab_to_int)
-
-# Dictionary to convert integers to words
-int_to_vocab = {}
-for word, value in vocab_to_int.items():
-    int_to_vocab[value] = word
-
-usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100
-
-print("Total number of unique words:", len(word_counts))
-print("Number of words we will use:", len(vocab_to_int))
-print("Percent of words we will use: {}%".format(usage_ratio))
-
-# Need to use 300 for embedding dimensions to match CN's vectors.
-embedding_dim = 300
-nb_words = len(vocab_to_int)
-
-# Create matrix with default values of zero
-word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
-for word, i in vocab_to_int.items():
-    if word in embeddings_index:
-        word_embedding_matrix[i] = embeddings_index[word]
+
+stop_words = set(stopwords.words('english'))
+
+def text_cleaner(text,num):
+    newString = text.lower()
+    newString = BeautifulSoup(newString, "lxml").text
+    newString = re.sub(r'\([^)]*\)', '', newString)
+    newString = re.sub('"','', newString)
+    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
+    newString = re.sub(r"'s\b","",newString)
+    newString = re.sub("[^a-zA-Z]", " ", newString)
+    newString = re.sub('[m]{2,}', 'mm', newString)
+    if(num==0):
+        tokens = [w for w in newString.split() if not w in stop_words]
     else:
-        # If word not in CN, create a random embedding for it
-        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
-        embeddings_index[word] = new_embedding
-        word_embedding_matrix[i] = new_embedding
-
-# Check if value matches len(vocab_to_int)
-print(len(word_embedding_matrix))
-
-def convert_to_ints(text, word_count, unk_count, eos=False):
-    '''Convert words in text to an integer.
-       If word is not in vocab_to_int, use <UNK>'s integer.
-       Total the number of words and UNKs.
-       Add <EOS> token to the end of texts'''
-    ints = []
-    for sentence in text:
-        sentence_ints = []
-        for word in sentence.split():
-            word_count += 1
-            if word in vocab_to_int:
-                sentence_ints.append(vocab_to_int[word])
-            else:
-                sentence_ints.append(vocab_to_int["<UNK>"])
-                unk_count += 1
-        if eos:
-            sentence_ints.append(vocab_to_int["<EOS>"])
-        ints.append(sentence_ints)
-    return ints, word_count, unk_count
-# Apply convert_to_ints to clean_summaries and clean_texts
-word_count = 0
-unk_count = 0
-
-int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
-int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)
-unk_percent = round(unk_count/word_count,4)*100
-
-print("Total number of words in headlines:", word_count)
-print("Total number of UNKs in headlines:", unk_count)
-print("Percent of words that are UNK: {}%".format(unk_percent))
-
-
-def create_lengths(text):
-    '''Create a data frame of the sentence lengths from a text'''
-    lengths = []
-    for sentence in text:
-        lengths.append(len(sentence))
-
-    return pd.DataFrame(lengths, columns=['counts'])
-
-lengths_summaries = create_lengths(int_summaries)
-lengths_texts = create_lengths(int_texts)
-
-print("Summaries:")
-print(lengths_summaries.describe())
-print()
-print("Texts:")
-print(lengths_texts.describe())
-
-
-def unk_counter(sentence):
-    '''Counts the number of times <UNK> appears in a sentence.'''
-    unk_count = 0
-    for word in sentence:
-        if word == vocab_to_int["<UNK>"]:
-            unk_count += 1
-    return unk_count
-
-# Sort the summaries and texts by the length of the texts, shortest to longest
-# Limit the length of summaries and texts based on the min and max ranges.
-# Remove reviews that include too many UNKs
-
-sorted_summaries = []
-sorted_texts = []
-max_text_length = 84
-max_summary_length = 13
-min_length = 2
-unk_text_limit = 1
-unk_summary_limit = 0
-
-for length in range(min(lengths_texts.counts), max_text_length):
-    for count, words in enumerate(int_summaries):
-        if (len(int_summaries[count]) >= min_length and
-            len(int_summaries[count]) <= max_summary_length and
-            len(int_texts[count]) >= min_length and
-            unk_counter(int_summaries[count]) <= unk_summary_limit and
-            unk_counter(int_texts[count]) <= unk_text_limit and
-            length == len(int_texts[count])
-           ):
-            sorted_summaries.append(int_summaries[count])
-            sorted_texts.append(int_texts[count])
-
-# Compare lengths to ensure they match
-print(len(sorted_summaries))
-print(len(sorted_texts))
-#print(np.array(sorted_texts).shape)
-
-def pad_sentence_batch(sentence,max_length):
-    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
-
-    return [sentence + [vocab_to_int['<PAD>']] * (max_length - len(sentence))]
-
-def process_encoding_input(target_data, vocab_to_int, batch_size):
-    '''Remove the last word id from each batch and concat the <GO> to the beginning of each batch'''
-
-    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
-    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
-
-    return dec_input
-
-padded_summaries=list()
-padded_text=list()
-
-for summaries in sorted_summaries:
-    summaries=process_encoding_input(summaries,vocab_to_int,1)
-    summaries=pad_sentence_batch(summaries,max_summary_length)
-    padded_summaries.append(summaries)
-
-for text in sorted_texts:
-    text=pad_sentence_batch(text,max_text_length)
-    padded_text.append(text)
+        tokens=newString.split()
+    long_words=[]
+    for i in tokens:
+        if len(i)>1:  # drop very short words
+            long_words.append(i)
+    return (" ".join(long_words)).strip()
+
+cleaned_text = []
+for t in data['Text']:
+    cleaned_text.append(text_cleaner(t,0))
+
+cleaned_summary = []
+for t in data['Summary']:
+    cleaned_summary.append(text_cleaner(t,1))
+
+data['cleaned_text']=cleaned_text
+data['cleaned_summary']=cleaned_summary
+'''
+import matplotlib.pyplot as plt
+
+text_word_count = []
+summary_word_count = []
+
+# populate the lists with sentence lengths
+for i in data['cleaned_text']:
+    text_word_count.append(len(i.split()))
+
+for i in data['cleaned_summary']:
+    summary_word_count.append(len(i.split()))
+
+length_df = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})
+
+length_df.hist(bins = 30)
+plt.show()
+'''
+
+# proportion of summaries that are at most 8 words long
+cnt=0
+for i in data['cleaned_summary']:
+    if(len(i.split())<=8):
+        cnt=cnt+1
+print(cnt/len(data['cleaned_summary']))
+
+max_text_len=30
+max_summary_len=8
+
+cleaned_text = np.array(data['cleaned_text'])
+cleaned_summary = np.array(data['cleaned_summary'])
+
+short_text = []
+short_summary = []
+
+for i in range(len(cleaned_text)):
+    if (len(cleaned_summary[i].split()) <= max_summary_len and len(cleaned_text[i].split()) <= max_text_len):
+        short_text.append(cleaned_text[i])
+        short_summary.append(cleaned_summary[i])
+
+df = pd.DataFrame({'text': short_text, 'summary': short_summary})
+
+# mark the start and end of every target summary
+df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
+
+from sklearn.model_selection import train_test_split
+x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0,shuffle=True)
+
+
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+
+#prepare a tokenizer for the source texts on training data
+x_tokenizer = Tokenizer()
+x_tokenizer.fit_on_texts(list(x_tr))
+
+thresh = 4
+
+cnt = 0
+tot_cnt = 0
+freq = 0
+tot_freq = 0
+
+for key, value in x_tokenizer.word_counts.items():
+    tot_cnt = tot_cnt + 1
+    tot_freq = tot_freq + value
+    if (value < thresh):
+        cnt = cnt + 1
+        freq = freq + value
+
+print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
+print("Total coverage of rare words:", (freq / tot_freq) * 100)
+
+# re-fit the tokenizer, keeping only the words that clear the frequency threshold
+x_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
+x_tokenizer.fit_on_texts(list(x_tr))
+
+#convert text sequences into integer sequences
+x_tr_seq = x_tokenizer.texts_to_sequences(x_tr)
+x_val_seq = x_tokenizer.texts_to_sequences(x_val)
+
+#pad with zeros up to the maximum length
+x_tr = pad_sequences(x_tr_seq, maxlen=max_text_len, padding='post')
+x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')
+
+#size of vocabulary (+1 for the padding token)
+x_voc = x_tokenizer.num_words + 1
+
+print(x_voc)
+
+#prepare a tokenizer for the summaries on training data
+y_tokenizer = Tokenizer()
+y_tokenizer.fit_on_texts(list(y_tr))
+
+thresh = 6
+
+cnt = 0
+tot_cnt = 0
+freq = 0
+tot_freq = 0
+
+for key, value in y_tokenizer.word_counts.items():
+    tot_cnt = tot_cnt + 1
+    tot_freq = tot_freq + value
+    if (value < thresh):
+        cnt = cnt + 1
+        freq = freq + value
+
+print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
+print("Total coverage of rare words:", (freq / tot_freq) * 100)
+
+# re-fit the tokenizer, keeping only the words that clear the frequency threshold
+y_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
+y_tokenizer.fit_on_texts(list(y_tr))
+
+#convert text sequences into integer sequences
+y_tr_seq = y_tokenizer.texts_to_sequences(y_tr)
+y_val_seq = y_tokenizer.texts_to_sequences(y_val)
+
+#pad with zeros up to the maximum length
+y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
+y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')
+
+#size of vocabulary (+1 for the padding token)
+y_voc = y_tokenizer.num_words + 1
+
+# sanity check: every training summary should contain the start token
+print(y_tokenizer.word_counts['sostok'], len(y_tr))
+
+# drop pairs whose summary holds only the start and end tokens (i.e. is empty)
+ind=[]
+for i in range(len(y_tr)):
+    cnt=0
+    for j in y_tr[i]:
+        if j!=0:
+            cnt=cnt+1
+    if(cnt==2):
+        ind.append(i)
+
+y_tr=np.delete(y_tr,ind, axis=0)
+x_tr=np.delete(x_tr,ind, axis=0)
+
+ind=[]
+for i in range(len(y_val)):
+    cnt=0
+    for j in y_val[i]:
+        if j!=0:
+            cnt=cnt+1
+    if(cnt==2):
+        ind.append(i)
+
+y_val=np.delete(y_val,ind, axis=0)
+x_val=np.delete(x_val,ind, axis=0)
\ No newline at end of file
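A subtlety in the tokenizer handling above: Tokenizer(num_words=k) does not shrink word_index — it only caps the indices that texts_to_sequences emits — which is why the vocabulary sizes are taken from num_words + 1 rather than len(word_index). A toy check, with illustrative sentences and values only:

    from keras.preprocessing.text import Tokenizer

    toy = ["the cat sat", "the cat ran", "a dog barked"]
    tok = Tokenizer(num_words=3)  # keep only indices 1 and 2, i.e. the two most frequent words
    tok.fit_on_texts(toy)
    print(tok.word_index)               # the full vocabulary is still recorded here
    print(tok.texts_to_sequences(toy))  # but only word indices < num_words survive
    vocab_size = tok.num_words + 1      # +1 because index 0 is reserved for padding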
-"what've": "what have", -"where'd": "where did", -"where's": "where is", -"who'll": "who will", -"who's": "who is", -"won't": "will not", -"wouldn't": "would not", -"you'd": "you would", -"you'll": "you will", -"you're": "you are" -} +contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", + "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", + "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", + "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", + "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", + "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", + "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", + "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", + "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", + "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", + "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", + "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", + "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", + "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", + "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", + "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", + "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", + "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", + "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", + "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", + "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", + "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", + "you're": "you are", "you've": "you have"} \ No newline at end of file diff --git a/inference_model_keras.py b/inference_model_keras.py new file mode 100644 index 0000000..a6b88e2 --- /dev/null +++ b/inference_model_keras.py @@ -0,0 +1,85 @@ +from train_model_keras import * + +reverse_target_word_index=y_tokenizer.index_word +reverse_source_word_index=x_tokenizer.index_word +target_word_index=y_tokenizer.word_index + +# Encode the input sequence to get the 
diff --git a/inference_model_keras.py b/inference_model_keras.py
new file mode 100644
index 0000000..a6b88e2
--- /dev/null
+++ b/inference_model_keras.py
@@ -0,0 +1,85 @@
+from train_model_keras import *
+
+reverse_target_word_index=y_tokenizer.index_word
+reverse_source_word_index=x_tokenizer.index_word
+target_word_index=y_tokenizer.word_index
+
+# Encode the input sequence to get the feature vector
+encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])
+
+# Decoder setup
+# The tensors below hold the states of the previous time step
+decoder_state_input_h = Input(shape=(latent_dim,))
+decoder_state_input_c = Input(shape=(latent_dim,))
+# Placeholder for the encoder output sequence; unused until the AttentionLayer is wired in
+decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))
+
+# Get the embeddings of the decoder sequence
+dec_emb2 = dec_emb_layer(decoder_inputs)
+# To predict the next word in the sequence, set the initial states to the states from the previous time step
+decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
+
+# A dense softmax layer to generate a probability distribution over the target vocabulary
+decoder_outputs2 = decoder_dense(decoder_outputs2)
+
+# Final decoder model
+decoder_model = Model(
+    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
+    [decoder_outputs2] + [state_h2, state_c2])
+
+
+def decode_sequence(input_seq):
+    # Encode the input as state vectors.
+    e_out, e_h, e_c = encoder_model.predict(input_seq)
+
+    # Generate an empty target sequence of length 1.
+    target_seq = np.zeros((1, 1))
+
+    # Populate the first word of the target sequence with the start word.
+    target_seq[0, 0] = target_word_index['sostok']
+
+    stop_condition = False
+    decoded_sentence = ''
+    while not stop_condition:
+
+        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
+
+        # Sample a token
+        sampled_token_index = np.argmax(output_tokens[0, -1, :])
+        sampled_token = reverse_target_word_index[sampled_token_index]
+
+        if (sampled_token != 'eostok'):
+            decoded_sentence += ' ' + sampled_token
+
+        # Exit condition: either hit max length or find the stop word.
+        if (sampled_token == 'eostok' or len(decoded_sentence.split()) >= (max_summary_len - 1)):
+            stop_condition = True
+
+        # Update the target sequence (of length 1).
+        target_seq = np.zeros((1, 1))
+        target_seq[0, 0] = sampled_token_index
+
+        # Update internal states
+        e_h, e_c = h, c
+
+    return decoded_sentence
+
+def seq2summary(input_seq):
+    newString=''
+    for i in input_seq:
+        if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']):
+            newString=newString+reverse_target_word_index[i]+' '
+    return newString
+
+def seq2text(input_seq):
+    newString=''
+    for i in input_seq:
+        if(i!=0):
+            newString=newString+reverse_source_word_index[i]+' '
+    return newString
+
+for i in range(0,5):
+    print("Review:", seq2text(x_tr[i]))
+    print("Original summary:", seq2summary(y_tr[i]))
+    print("Predicted summary:", decode_sequence(x_tr[i].reshape(1, max_text_len)))
+    print("\n")
\ No newline at end of file
diff --git a/seq2seq_keras_sp.h5 b/seq2seq_keras_sp.h5
new file mode 100644
index 0000000..ce0af92
Binary files /dev/null and b/seq2seq_keras_sp.h5 differ
diff --git a/train_model_keras.py b/train_model_keras.py
index 93686da..f3d5d3b 100644
--- a/train_model_keras.py
+++ b/train_model_keras.py
@@ -1,17 +1,13 @@
-#training the model in keras
 from build_model_keras import *
-start = 200000
-end = start + 50000
-sorted_summaries_short = sorted_summaries[start:end]
-sorted_texts_short = sorted_texts[start:end]
-print("The shortest text length:", len(sorted_texts_short[0]))
-print("The longest text length:", len(sorted_texts_short[-1]))
-embeddings=word_embedding_matrix
-enc_embed_input=tf.nn.embedding_lookup(embeddings,padded_text)
-dec_embed_input=tf.nn.embedding_lookup(embeddings,padded_summaries)
-n_units=100
-model,encoder_model,decoder_model=define_model(max_text_length,max_summary_length,n_units)
+model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
+checkpointer = ModelCheckpoint(filepath='seq2seq_keras_sp.h5', verbose=1, save_best_only=True)
+# Teacher forcing: the decoder sees y[:, :-1] and is trained to predict y[:, 1:]
+history = model.fit([x_tr, y_tr[:, :-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:], epochs=50,
+                    callbacks=[checkpointer], batch_size=128,
+                    validation_data=([x_val, y_val[:, :-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]))

-model.compile(optimizer='rmsprop',loss='categorical_crossentropy')
+from matplotlib import pyplot
+pyplot.plot(history.history['loss'], label='train')
+pyplot.plot(history.history['val_loss'], label='validation')
+pyplot.legend()
+pyplot.show()
\ No newline at end of file
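One last note on the fit() call above, since the slicing is easy to misread: the decoder input drops the final token (y_tr[:, :-1]) while the target drops the first (y_tr[:, 1:]) and gains a trailing unit axis for sparse categorical cross-entropy, so at each step the model learns to predict the next ground-truth token. A toy illustration with made-up token ids:

    import numpy as np

    # hypothetical padded summary: sostok=1, eostok=2, content words=5,6,7, pad=0
    y = np.array([[1, 5, 6, 7, 2, 0, 0, 0]])
    decoder_input = y[:, :-1]            # [[1, 5, 6, 7, 2, 0, 0]]
    target = y.reshape(1, 8, 1)[:, 1:]   # [[5, 6, 7, 2, 0, 0, 0]] with shape (1, 7, 1)
    # at step t the decoder sees decoder_input[:, t] and is scored against target[:, t]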