Commit d508060: Add files via upload
sasibonu authored May 5, 2024 (1 parent: e819107)
Showing 1 changed file with 187 additions and 0 deletions: SasiBonuA05.ipynb
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from datasets import load_dataset\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"import seaborn as sns\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import Input, LSTM, Dense"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"dataset_en_hi = load_dataset(\"open_subtitles\", \"en-hi\")\n",
"dataset_da_ru = load_dataset(\"open_subtitles\", \"da-ru\")\n",
"data_en_hi = dataset_en_hi[\"train\"]\n",
"data_da_ru = dataset_da_ru[\"train\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': ['0', '1'],\n",
" 'meta': [{'year': 1948,\n",
" 'imdbId': 40522,\n",
" 'subtitleId': {'en': 4180294, 'hi': 4239106},\n",
" 'sentenceIds': {'en': [1], 'hi': [1]}},\n",
" {'year': 1948,\n",
" 'imdbId': 40522,\n",
" 'subtitleId': {'en': 4180294, 'hi': 4239106},\n",
" 'sentenceIds': {'en': [2], 'hi': [2]}}],\n",
" 'translation': [{'en': 'THE BICYCLE THIEF', 'hi': 'साइकिल चोर'},\n",
" {'en': 'Ricci?', 'hi': 'रिच्ची?'}]}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_en_hi[0:2]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# Extract English and Hindi sentences\n",
"english_sentences = [item['en'] for item in data_en_hi['translation']]\n",
"hindi_sentences = [item['hi'] for item in data_en_hi['translation']]\n",
"\n",
"# Tokenize English sentences\n",
"en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')\n",
"en_tokenizer.fit_on_texts(english_sentences)\n",
"en_vocab_size = len(en_tokenizer.word_index) + 1\n",
"max_en_seq_length = max([len(sentence.split()) for sentence in english_sentences])\n",
"en_input_sequences = en_tokenizer.texts_to_sequences(english_sentences)\n",
"padded_en_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(en_input_sequences, padding='post', maxlen=max_en_seq_length)\n",
"\n",
"# Tokenize Hindi sentences\n",
"hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')\n",
"hi_tokenizer.fit_on_texts(hindi_sentences)\n",
"hi_vocab_size = len(hi_tokenizer.word_index) + 1\n",
"max_hi_seq_length = max([len(sentence.split()) for sentence in hindi_sentences])\n",
"hi_input_sequences = hi_tokenizer.texts_to_sequences(hindi_sentences)\n",
"padded_hi_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(hi_input_sequences, padding='post', maxlen=max_hi_seq_length)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Define encoder-decoder model\n",
"embedding_dim = 256\n",
"\n",
"# Encoder\n",
"encoder_inputs = Input(shape=(max_en_seq_length,))\n",
"encoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(encoder_inputs)\n",
"encoder_lstm = LSTM(embedding_dim, return_state=True)\n",
"encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)\n",
"encoder_states = [state_h, state_c]\n",
"\n",
"# Decoder\n",
"decoder_inputs = Input(shape=(max_hi_seq_length,))\n",
"decoder_embedding = tf.keras.layers.Embedding(hi_vocab_size, embedding_dim)(decoder_inputs)\n",
"decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)\n",
"decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)\n",
"decoder_dense = Dense(hi_vocab_size, activation='softmax')\n",
"decoder_outputs = decoder_dense(decoder_outputs)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4654s\u001b[0m 3s/step - accuracy: 0.8945 - loss: 1.2579\n",
"Epoch 2/3\n",
"\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5806s\u001b[0m 4s/step - accuracy: 0.9642 - loss: 0.3142\n",
"Epoch 3/3\n",
"\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3675s\u001b[0m 3s/step - accuracy: 0.9805 - loss: 0.1782\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
}
],
"source": [
"\n",
"# Model\n",
"model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
"\n",
"# Compile model\n",
"model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n",
"\n",
"# Train model\n",
"model.fit([padded_en_input_sequences, padded_hi_input_sequences], np.expand_dims(padded_hi_input_sequences, -1), batch_size=64, epochs=3)\n",
"\n",
"# Save model\n",
"model.save('translation_model.h5')\n"
]
},
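{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- Inference sketch (added; not part of the original training run) ---\n",
"# Greedy, token-by-token decoding that reuses the trained layers. The names\n",
"# below (encoder_model, decoder_model, translate) and the seed-token choice\n",
"# are illustrative assumptions: training used no <start>/<end> markers, so\n",
"# decoding is seeded with index 1 (the most frequent Hindi word) purely\n",
"# for demonstration.\n",
"\n",
"# Recover the trained Embedding layers from the full model; this assumes\n",
"# model.layers lists the encoder embedding before the decoder embedding,\n",
"# matching the construction order above.\n",
"emb_layers = [l for l in model.layers if isinstance(l, tf.keras.layers.Embedding)]\n",
"enc_emb_layer, dec_emb_layer = emb_layers  # only the decoder's is needed here\n",
"\n",
"# Encoder inference model: source token ids -> final LSTM states\n",
"encoder_model = Model(encoder_inputs, encoder_states)\n",
"\n",
"# Decoder inference model: one target token + previous states ->\n",
"# next-token probabilities and updated states\n",
"dec_state_h = Input(shape=(embedding_dim,))\n",
"dec_state_c = Input(shape=(embedding_dim,))\n",
"dec_token = Input(shape=(1,))\n",
"dec_emb = dec_emb_layer(dec_token)\n",
"dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[dec_state_h, dec_state_c])\n",
"decoder_model = Model([dec_token, dec_state_h, dec_state_c], [decoder_dense(dec_out), dec_h, dec_c])\n",
"\n",
"hi_index_to_word = {i: w for w, i in hi_tokenizer.word_index.items()}\n",
"\n",
"def translate(sentence, max_len=20):\n",
"    seq = en_tokenizer.texts_to_sequences([sentence])\n",
"    seq = tf.keras.preprocessing.sequence.pad_sequences(seq, padding='post', maxlen=max_en_seq_length)\n",
"    h, c = encoder_model.predict(seq, verbose=0)\n",
"    token = np.array([[1]])  # illustrative seed; no <start> token was trained\n",
"    words = []\n",
"    for _ in range(max_len):\n",
"        probs, h, c = decoder_model.predict([token, h, c], verbose=0)\n",
"        next_id = int(np.argmax(probs[0, -1, :]))\n",
"        if next_id == 0:  # padding index: nothing more to emit\n",
"            break\n",
"        words.append(hi_index_to_word.get(next_id, ''))\n",
"        token = np.array([[next_id]])\n",
"    return ' '.join(words)\n",
"\n",
"print(translate('THE BICYCLE THIEF'))\n"
]
},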
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
