Add files via upload

sasibonu · May 5, 2024 · d508060 · d508060
1 parent e819107
commit d508060
Showing 1 changed file with 187 additions and 0 deletions.
diff --git a/SasiBonuA05.ipynb b/SasiBonuA05.ipynb
@@ -0,0 +1,187 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from datasets import load_dataset\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.models import Model\n",
+    "from tensorflow.keras.layers import Input, LSTM, Dense"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_en_hi = load_dataset(\"open_subtitles\", \"en-hi\")\n",
+    "dataset_da_ru = load_dataset(\"open_subtitles\", \"da-ru\")\n",
+    "data_en_hi = dataset_en_hi[\"train\"]\n",
+    "data_da_ru = dataset_da_ru[\"train\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': ['0', '1'],\n",
+       " 'meta': [{'year': 1948,\n",
+       "   'imdbId': 40522,\n",
+       "   'subtitleId': {'en': 4180294, 'hi': 4239106},\n",
+       "   'sentenceIds': {'en': [1], 'hi': [1]}},\n",
+       "  {'year': 1948,\n",
+       "   'imdbId': 40522,\n",
+       "   'subtitleId': {'en': 4180294, 'hi': 4239106},\n",
+       "   'sentenceIds': {'en': [2], 'hi': [2]}}],\n",
+       " 'translation': [{'en': 'THE BICYCLE THIEF', 'hi': 'साइकिल चोर'},\n",
+       "  {'en': 'Ricci?', 'hi': 'रिच्ची?'}]}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_en_hi[0:2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# Extract English and Hindi sentences\n",
+    "english_sentences = [item['en'] for item in data_en_hi['translation']]\n",
+    "hindi_sentences = [item['hi'] for item in data_en_hi['translation']]\n",
+    "\n",
+    "# Tokenize English sentences\n",
+    "en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')\n",
+    "en_tokenizer.fit_on_texts(english_sentences)\n",
+    "en_vocab_size = len(en_tokenizer.word_index) + 1\n",
+    "max_en_seq_length = max([len(sentence.split()) for sentence in english_sentences])\n",
+    "en_input_sequences = en_tokenizer.texts_to_sequences(english_sentences)\n",
+    "padded_en_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(en_input_sequences, padding='post', maxlen=max_en_seq_length)\n",
+    "\n",
+    "# Tokenize Hindi sentences\n",
+    "hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')\n",
+    "hi_tokenizer.fit_on_texts(hindi_sentences)\n",
+    "hi_vocab_size = len(hi_tokenizer.word_index) + 1\n",
+    "max_hi_seq_length = max([len(sentence.split()) for sentence in hindi_sentences])\n",
+    "hi_input_sequences = hi_tokenizer.texts_to_sequences(hindi_sentences)\n",
+    "padded_hi_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(hi_input_sequences, padding='post', maxlen=max_hi_seq_length)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Define encoder-decoder model\n",
+    "embedding_dim = 256\n",
+    "\n",
+    "# Encoder\n",
+    "encoder_inputs = Input(shape=(max_en_seq_length,))\n",
+    "encoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(encoder_inputs)\n",
+    "encoder_lstm = LSTM(embedding_dim, return_state=True)\n",
+    "encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)\n",
+    "encoder_states = [state_h, state_c]\n",
+    "\n",
+    "# Decoder\n",
+    "decoder_inputs = Input(shape=(max_hi_seq_length,))\n",
+    "decoder_embedding = tf.keras.layers.Embedding(hi_vocab_size, embedding_dim)(decoder_inputs)\n",
+    "decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)\n",
+    "decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)\n",
+    "decoder_dense = Dense(hi_vocab_size, activation='softmax')\n",
+    "decoder_outputs = decoder_dense(decoder_outputs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/3\n",
+      "\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4654s\u001b[0m 3s/step - accuracy: 0.8945 - loss: 1.2579\n",
+      "Epoch 2/3\n",
+      "\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5806s\u001b[0m 4s/step - accuracy: 0.9642 - loss: 0.3142\n",
+      "Epoch 3/3\n",
+      "\u001b[1m1454/1454\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3675s\u001b[0m 3s/step - accuracy: 0.9805 - loss: 0.1782\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Model\n",
+    "model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
+    "\n",
+    "# Compile model\n",
+    "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n",
+    "\n",
+    "# Train model\n",
+    "model.fit([padded_en_input_sequences, padded_hi_input_sequences], np.expand_dims(padded_hi_input_sequences, -1), batch_size=64, epochs=3)\n",
+    "\n",
+    "# Save model\n",
+    "model.save('translation_model.h5')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}