From d977d1dd0e6e2e6040c981848dc34763d59357cd Mon Sep 17 00:00:00 2001 From: Sasi Bonu <37780704+sasibonu@users.noreply.github.com> Date: Sat, 4 May 2024 16:36:00 -0600 Subject: [PATCH] Delete .ipynb_checkpoints directory --- .../SasiBonuA04-checkpoint.ipynb | 627 ------------------ 1 file changed, 627 deletions(-) delete mode 100644 .ipynb_checkpoints/SasiBonuA04-checkpoint.ipynb diff --git a/.ipynb_checkpoints/SasiBonuA04-checkpoint.ipynb b/.ipynb_checkpoints/SasiBonuA04-checkpoint.ipynb deleted file mode 100644 index e88ea75..0000000 --- a/.ipynb_checkpoints/SasiBonuA04-checkpoint.ipynb +++ /dev/null @@ -1,627 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from datasets import load_dataset\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import train_test_split\n", - "import seaborn as sns\n", - "\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.svm import SVC\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import confusion_matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "dataset_en_hi = load_dataset(\"open_subtitles\", \"en-hi\")\n", - "dataset_da_ru = load_dataset(\"open_subtitles\", \"da-ru\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "data_en_hi = dataset_en_hi[\"train\"]\n", - "data_da_ru = dataset_da_ru[\"train\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': ['0', '1'],\n", - " 'meta': [{'year': 1948,\n", - " 'imdbId': 40522,\n", - " 'subtitleId': {'en': 4180294, 'hi': 4239106},\n", - " 'sentenceIds': {'en': [1], 'hi': [1]}},\n", - " {'year': 1948,\n", - " 'imdbId': 40522,\n", - " 'subtitleId': {'en': 4180294, 'hi': 4239106},\n", - " 'sentenceIds': {'en': [2], 'hi': [2]}}],\n", - " 'translation': [{'en': 'THE BICYCLE THIEF', 'hi': 'साइकिल चोर'},\n", - " {'en': 'Ricci?', 'hi': 'रिच्ची?'}]}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_en_hi[0:2]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "data_en_hi = data_en_hi[:len(data_en_hi)//4] # Get the first half of the data" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.svm import SVC\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "# Extract English subtitles and their corresponding Hindi translations\n", - "english_subtitles = [item['en'] for item in data_en_hi['translation']]\n", - "hindi_translations = [item['hi'] for item in data_en_hi['translation']]\n", - "merged_subtitles = english_subtitles + hindi_translations\n", - "labels = [0] * len(english_subtitles) + [1] * len(hindi_translations)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Convert text data into numerical features using CountVectorizer\n", - "vectorizer = CountVectorizer()\n", - "X = vectorizer.fit_transform(merged_subtitles)\n", - "y = labels" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = SVC(C=1, kernel='linear') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "IOPub data rate exceeded.\n", - "The Jupyter server will temporarily stop sending output\n", - "to the client in order to avoid crashing it.\n", - "To change this limit, set the config variable\n", - "`--ServerApp.iopub_data_rate_limit`.\n", - "\n", - "Current values:\n", - "ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", - "ServerApp.rate_limit_window=3.0 (secs)\n", - "\n" - ] - } - ], - "source": [ - "# Check the unique classes in Hindi translations\n", - "unique_classes = set(hindi_translations)\n", - "print(\"Unique classes:\", unique_classes)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_labels(data):\n", - " \"\"\"\n", - " This function assigns labels based on the presence of keys 'en' and 'hi'.\n", - " \"\"\"\n", - " labels = []\n", - " for item in data['translation']:\n", - " if 'en' in item:\n", - " labels.append(1) # Label 1 if 'en' key exists\n", - " elif 'hi' in item and 'en' not in item:\n", - " labels.append(0) # Label 0 if 'hi' exists (but not 'en')\n", - " else:\n", - " # Handle cases where neither 'en' nor 'hi' exist (modify as needed)\n", - " labels.append(-1) # Example: assign -1 for undefined label\n", - " return labels\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "text = []\n", - "\n", - "for item in data_en_hi['translation']:\n", - " text.append(item['en']) \n", - "\n", - "labels = prepare_labels(data_en_hi)\n", - "vectorizer = CountVectorizer()\n", - "\n", - "if len(text) != len(labels):\n", - " raise ValueError(\"Number of sentences and labels don't match!\")\n", - "\n", - "X = vectorizer.fit_transform(text)\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "The number of classes has to be greater than one; got 1 class", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m SVC(C\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, kernel\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrbf\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;66;03m# You can experiment with different kernels\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Use the model for prediction on the testing set\u001b[39;00m\n\u001b[1;32m 5\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mpredict(X_test)\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py:1152\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1145\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1147\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1148\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1149\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1150\u001b[0m )\n\u001b[1;32m 1151\u001b[0m ):\n\u001b[0;32m-> 1152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_base.py:199\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 190\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_data(\n\u001b[1;32m 191\u001b[0m X,\n\u001b[1;32m 192\u001b[0m y,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 196\u001b[0m accept_large_sparse\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 197\u001b[0m )\n\u001b[0;32m--> 199\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_targets\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 201\u001b[0m sample_weight \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masarray(\n\u001b[1;32m 202\u001b[0m [] \u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m sample_weight, dtype\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mfloat64\n\u001b[1;32m 203\u001b[0m )\n\u001b[1;32m 204\u001b[0m solver_type \u001b[38;5;241m=\u001b[39m LIBSVM_IMPL\u001b[38;5;241m.\u001b[39mindex(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_impl)\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_base.py:747\u001b[0m, in \u001b[0;36mBaseSVC._validate_targets\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_weight_ \u001b[38;5;241m=\u001b[39m compute_class_weight(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_weight, classes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mcls\u001b[39m, y\u001b[38;5;241m=\u001b[39my_)\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[0;32m--> 747\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 748\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe number of classes has to be greater than one; got \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m class\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 749\u001b[0m \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m)\n\u001b[1;32m 750\u001b[0m )\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclasses_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39masarray(y, dtype\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mfloat64, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mValueError\u001b[0m: The number of classes has to be greater than one; got 1 class" - ] - } - ], - "source": [ - "model = SVC(C=1, kernel='rbf') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "The number of classes has to be greater than one; got 1 class", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Step 5: Train the SVM model\u001b[39;00m\n\u001b[1;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m SVC(C\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, kernel\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrbf\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 7\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# Step 6: Evaluate the model\u001b[39;00m\n\u001b[1;32m 10\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mpredict(X_test)\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py:1152\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1145\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1147\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1148\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1149\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1150\u001b[0m )\n\u001b[1;32m 1151\u001b[0m ):\n\u001b[0;32m-> 1152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_base.py:199\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 190\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_data(\n\u001b[1;32m 191\u001b[0m X,\n\u001b[1;32m 192\u001b[0m y,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 196\u001b[0m accept_large_sparse\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 197\u001b[0m )\n\u001b[0;32m--> 199\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_targets\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 201\u001b[0m sample_weight \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masarray(\n\u001b[1;32m 202\u001b[0m [] \u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m sample_weight, dtype\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mfloat64\n\u001b[1;32m 203\u001b[0m )\n\u001b[1;32m 204\u001b[0m solver_type \u001b[38;5;241m=\u001b[39m LIBSVM_IMPL\u001b[38;5;241m.\u001b[39mindex(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_impl)\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_base.py:747\u001b[0m, in \u001b[0;36mBaseSVC._validate_targets\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_weight_ \u001b[38;5;241m=\u001b[39m compute_class_weight(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_weight, classes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mcls\u001b[39m, y\u001b[38;5;241m=\u001b[39my_)\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[0;32m--> 747\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 748\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe number of classes has to be greater than one; got \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m class\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 749\u001b[0m \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m)\n\u001b[1;32m 750\u001b[0m )\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclasses_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39masarray(y, dtype\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mfloat64, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mValueError\u001b[0m: The number of classes has to be greater than one; got 1 class" - ] - } - ], - "source": [ - "\n", - "\n", - "# Step 4: Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)\n", - "\n", - "\n", - "# Step 5: Train the SVM model\n", - "model = SVC(C=1, kernel='rbf')\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Step 6: Evaluate the model\n", - "y_pred = model.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of samples in X: 186032\n", - "Number of samples in labels: 186032\n" - ] - } - ], - "source": [ - "# Check the number of samples in X and labels\n", - "print(\"Number of samples in X:\", X.shape[0])\n", - "print(\"Number of samples in labels:\", len(labels))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 0.9999\n", - "Confusion Matrix:\n", - "[[ 0 1]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=1, kernel='poly', degree=3) # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 1.0000\n", - "Confusion Matrix:\n", - "[[ 1 0]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=1, kernel='linear') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 0.9999\n", - "Confusion Matrix:\n", - "[[ 0 1]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "\n", - "# Train the SVM model\n", - "model = SVC(C=2, kernel='rbf') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 0.9999\n", - "Confusion Matrix:\n", - "[[ 0 1]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=2, kernel='poly', degree=3) # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 1.0000\n", - "Confusion Matrix:\n", - "[[ 1 0]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=2, kernel='linear') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 0.9999\n", - "Confusion Matrix:\n", - "[[ 0 1]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "\n", - "# Train the SVM model\n", - "model = SVC(C=5, kernel='rbf') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'SVC' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Train the SVM model\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mSVC\u001b[49m(C\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, kernel\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpoly\u001b[39m\u001b[38;5;124m'\u001b[39m, degree\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m) \u001b[38;5;66;03m# You can experiment with different kernels\u001b[39;00m\n\u001b[1;32m 3\u001b[0m model\u001b[38;5;241m.\u001b[39mfit(X_train, y_train)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Use the model for prediction on the testing set\u001b[39;00m\n", - "\u001b[0;31mNameError\u001b[0m: name 'SVC' is not defined" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=5, kernel='poly', degree=3) # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)\n", - "\n", - "plt.figure(figsize=(8, 6))\n", - "sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('True')\n", - "plt.title('Confusion Matrix')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model accuracy on testing data: 1.0000\n", - "Confusion Matrix:\n", - "[[ 1 0]\n", - " [ 0 18603]]\n" - ] - } - ], - "source": [ - "# Train the SVM model\n", - "model = SVC(C=5, kernel='linear') # You can experiment with different kernels\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Use the model for prediction on the testing set\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate model performance (e.g., accuracy)\n", - "from sklearn.metrics import accuracy_score\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(f\"Model accuracy on testing data: {accuracy:.4f}\")\n", - "\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print the confusion matrix\n", - "print(\"Confusion Matrix:\")\n", - "print(conf_matrix)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}