diff --git a/ACI IoT Network Traffic Dataset Analysis/Dataset/README.md b/ACI IoT Network Traffic Dataset Analysis/Dataset/README.md new file mode 100644 index 000000000..e4f69271e --- /dev/null +++ b/ACI IoT Network Traffic Dataset Analysis/Dataset/README.md @@ -0,0 +1 @@ +Link: https://www.kaggle.com/datasets/emilynack/aci-iot-network-traffic-dataset-2023 diff --git a/ACI IoT Network Traffic Dataset Analysis/Images/Bar Graph.png b/ACI IoT Network Traffic Dataset Analysis/Images/Bar Graph.png new file mode 100644 index 000000000..2d390c3bd Binary files /dev/null and b/ACI IoT Network Traffic Dataset Analysis/Images/Bar Graph.png differ diff --git a/ACI IoT Network Traffic Dataset Analysis/Images/Pie Chart.png b/ACI IoT Network Traffic Dataset Analysis/Images/Pie Chart.png new file mode 100644 index 000000000..86449d357 Binary files /dev/null and b/ACI IoT Network Traffic Dataset Analysis/Images/Pie Chart.png differ diff --git a/ACI IoT Network Traffic Dataset Analysis/Model/ACI_IoT_Network_Traffic_Dataset_Analysis.ipynb b/ACI IoT Network Traffic Dataset Analysis/Model/ACI_IoT_Network_Traffic_Dataset_Analysis.ipynb new file mode 100644 index 000000000..66c7e6e01 --- /dev/null +++ b/ACI IoT Network Traffic Dataset Analysis/Model/ACI_IoT_Network_Traffic_Dataset_Analysis.ipynb @@ -0,0 +1,3194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "YwuxWrIj26L7" + }, + "source": [ + "# ACI IoT Network Traffic" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eA-t2jeQ2_Ay" + }, + "source": [ + "## Get dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "f2-gDXPihjaF" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "D:\\ML\\ACI IoT 
Network Traffic Dataset Analysis\\Model\n" + ] + } + ], + "source": [ + "import os\n", + "print(os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
srcipsportdstipdsportprotocol_msttltotal_lenpayloadstimelabel
0192.168.1.8160683239.255.255.2501900udp23624e4f54494659202a20485454502f312e310d0a4e54533a...1698670981Benign
1192.168.1.953160239.255.255.2501900udp12044d2d534541524348202a20485454502f312e310d0a484f...1698670984Benign
2192.168.1.953160239.255.255.2501900udp12044d2d534541524348202a20485454502f312e310d0a484f...1698670985Benign
3192.168.1.953160239.255.255.2501900udp12044d2d534541524348202a20485454502f312e310d0a484f...1698670986Benign
4192.168.1.953160239.255.255.2501900udp12044d2d534541524348202a20485454502f312e310d0a484f...1698670987Benign
\n", + "
" + ], + "text/plain": [ + " srcip sport dstip dsport protocol_m sttl total_len \\\n", + "0 192.168.1.81 60683 239.255.255.250 1900 udp 2 362 \n", + "1 192.168.1.9 53160 239.255.255.250 1900 udp 1 204 \n", + "2 192.168.1.9 53160 239.255.255.250 1900 udp 1 204 \n", + "3 192.168.1.9 53160 239.255.255.250 1900 udp 1 204 \n", + "4 192.168.1.9 53160 239.255.255.250 1900 udp 1 204 \n", + "\n", + " payload stime label \n", + "0 4e4f54494659202a20485454502f312e310d0a4e54533a... 1698670981 Benign \n", + "1 4d2d534541524348202a20485454502f312e310d0a484f... 1698670984 Benign \n", + "2 4d2d534541524348202a20485454502f312e310d0a484f... 1698670985 Benign \n", + "3 4d2d534541524348202a20485454502f312e310d0a484f... 1698670986 Benign \n", + "4 4d2d534541524348202a20485454502f312e310d0a484f... 1698670987 Benign " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('D:\\ML\\ACI IoT Network Traffic Dataset Analysis\\Dataset\\ACI-IoT-2023-Payload.csv')\n", + "pd.set_option('display.max_columns', None)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dW7fRPpahgGd" + }, + "source": [ + "## EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "t4ChRqiXsIZZ" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "srcip 0\n", + "sport 0\n", + "dstip 0\n", + "dsport 0\n", + "protocol_m 0\n", + "sttl 0\n", + "total_len 0\n", + "payload 0\n", + "stime 0\n", + "label 0\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "CxOR0kJM3SWR" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "label\n", + "Benign 601868\n", + "DNS Flood 18577\n", + "Dictionary Attack 4645\n", + "Slowloris 2974\n", + "SYN Flood 2113\n", + "Port Scan 582\n", + "Vulnerability Scan 445\n", + 
"OS Scan 156\n", + "UDP Flood 68\n", + "ICMP Flood 58\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.label.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "mrlQp4My4faj" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "labels = ['Benign', 'DNS Flood', 'Dictionary Attack', 'Slowloris', 'SYN Flood',\n", + " 'Port Scan', 'Vulnerability Scan', 'OS Scan', 'UDP Flood', 'ICMP Flood']\n", + "sizes = [601868, 18577, 4645, 2974, 2113, 582, 445, 156, 68, 58]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "Actc2Dc-4l4W" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 6))\n", + "plt.bar(labels, sizes)\n", + "plt.title('Bar Graph')\n", + "plt.xlabel('Labels')\n", + "plt.ylabel('Count')\n", + "plt.xticks(rotation=45)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "YtSxaBSZ5C_h" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 10))\n", + "wedges, texts, autotexts = ax.pie(sizes, autopct='%1.1f%%', startangle=140)\n", + "\n", + "ax.axis('equal')\n", + "plt.legend(wedges, labels, title=\"Activities\", loc=\"center left\", bbox_to_anchor=(1, 0, 0.5, 1))\n", + "\n", + "plt.title('Distribution of Network Activities')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KObpx4wfj3aQ" + }, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "LuWeKTG2j0hF" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "x0UovuBRjmE0" + }, + "outputs": [], + "source": [ + "X = df.drop('label', axis=1)\n", + "y = df['label']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy import sparse\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "encoder = OneHotEncoder(sparse_output=True, dtype=np.float32)\n", + "\n", + "X_sparse = encoder.fit_transform(X)\n", + "\n", + "y_encoded = encoder.fit_transform(np.array(y).reshape(-1, 1))\n", + "y_train_dense = y_encoded.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "pbRTAjhy6Yb3" + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X_sparse, y_train_dense, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "FBpz07hL_cxj" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((505188, 646287), (126298, 646287))" + ] + }, + "execution_count": 10, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((505188, 10), (126298, 10))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jNDgEG-88ff9" + }, + "source": [ + "## Model Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lxFMcxYX-wMg" + }, + "source": [ + "### Model 1: Random Forest Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "2IhE9rN0_ZSE" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier(n_estimators=10, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(n_estimators=10, random_state=42)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "model_1 = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "model_1.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model_1.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "id": "m76sl7hf_mMY" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9977196788547721\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 126154\n", + " 1 1.00 1.00 1.00 126186\n", + "\n", + " micro avg 1.00 1.00 1.00 252340\n", + " macro avg 1.00 1.00 1.00 252340\n", + "weighted avg 1.00 1.00 1.00 252340\n", + " samples avg 1.00 1.00 1.00 252340\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Assuming y_test and y_pred are in multi-label format\n", + "mlb = MultiLabelBinarizer()\n", + "y_test_binary = mlb.fit_transform(y_test)\n", + "y_pred_binary = mlb.transform(y_pred)\n", + "\n", + "# Compute and print accuracy\n", + "accuracy = accuracy_score(y_test_binary, y_pred_binary)\n", + "print(f\"Accuracy: {accuracy}\")\n", + "\n", + "# Print classification report\n", + "print(classification_report(y_test_binary, y_pred_binary, zero_division=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FsWqdWgf_PCW" + }, + "source": [ + "### Model 2: XGBClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "id": "uPcz1c688h5U" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+       "              colsample_bylevel=None, colsample_bynode=None,\n",
+       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
+       "              gamma=None, grow_policy=None, importance_type=None,\n",
+       "              interaction_constraints=None, learning_rate=None, max_bin=None,\n",
+       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
+       "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
+       "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
+       "              multi_strategy=None, n_estimators=None, n_jobs=None,\n",
+       "              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=None, ...)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from xgboost import XGBClassifier\n", + "\n", + "model_2 = XGBClassifier()\n", + "model_2.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model_2.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9979176233986287\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 126154\n", + " 1 1.00 1.00 1.00 126186\n", + "\n", + " micro avg 1.00 1.00 1.00 252340\n", + " macro avg 1.00 1.00 1.00 252340\n", + "weighted avg 1.00 1.00 1.00 252340\n", + " samples avg 1.00 1.00 1.00 252340\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Assuming y_test and y_pred are in multi-label format\n", + "mlb = MultiLabelBinarizer()\n", + "y_test_binary = mlb.fit_transform(y_test)\n", + "y_pred_binary = mlb.transform(y_pred)\n", + "\n", + "# Compute and print 
accuracy\n", + "accuracy = accuracy_score(y_test_binary, y_pred_binary)\n", + "print(f\"Accuracy: {accuracy}\")\n", + "\n", + "# Print classification report\n", + "print(classification_report(y_test_binary, y_pred_binary, zero_division=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 3: SVM" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
MultiOutputClassifier(estimator=SVC(), n_jobs=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "MultiOutputClassifier(estimator=SVC(), n_jobs=-1)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.multioutput import MultiOutputClassifier\n", + "from sklearn.svm import SVC\n", + "\n", + "svm = SVC(kernel='rbf', gamma='scale', C=1.0)\n", + "\n", + "model_3 = MultiOutputClassifier(svm, n_jobs=-1)\n", + "\n", + "model_3.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model_3.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9999287399642116\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 126298\n", + " 1 1.00 1.00 1.00 126298\n", + "\n", + " micro avg 1.00 1.00 1.00 252596\n", + " macro avg 1.00 1.00 1.00 252596\n", + "weighted avg 1.00 1.00 1.00 252596\n", + " samples avg 1.00 1.00 1.00 252596\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Assuming y_test and y_pred are in multi-label format\n", + "mlb = MultiLabelBinarizer()\n", + "y_test_binary = mlb.fit_transform(y_test)\n", + "y_pred_binary = mlb.transform(y_pred)\n", + "\n", + "# Compute and print accuracy\n", + "accuracy = accuracy_score(y_test_binary, y_pred_binary)\n", + "print(f\"Accuracy: {accuracy}\")\n", + "\n", + "# Print classification report\n", + "print(classification_report(y_test_binary, y_pred_binary, zero_division=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 4: KNN" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=3)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "model_4 = KNeighborsClassifier(n_neighbors=3)\n", + "model_4.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model_4.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9999841644364915\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 126298\n", + " 1 1.00 1.00 1.00 126298\n", + "\n", + " micro avg 1.00 1.00 1.00 252596\n", + " macro avg 1.00 1.00 1.00 252596\n", + "weighted avg 1.00 1.00 1.00 252596\n", + " samples avg 1.00 1.00 1.00 252596\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Assuming y_test and y_pred are in multi-label format\n", + "mlb = MultiLabelBinarizer()\n", + "y_test_binary = mlb.fit_transform(y_test)\n", + "y_pred_binary = mlb.transform(y_pred)\n", + "\n", + "# Compute and print accuracy\n", + "accuracy = accuracy_score(y_test_binary, y_pred_binary)\n", + "print(f\"Accuracy: {accuracy}\")\n", + "\n", + "# Print classification report\n", + "print(classification_report(y_test_binary, y_pred_binary, zero_division=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 5: Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DecisionTreeClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DecisionTreeClassifier(random_state=42)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "model_5 = DecisionTreeClassifier(random_state=42)\n", + "model_5.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model_5.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 1.0\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 126298\n", + " 1 1.00 1.00 1.00 126298\n", + "\n", + " micro avg 1.00 1.00 1.00 252596\n", + " macro avg 1.00 1.00 1.00 252596\n", + "weighted avg 1.00 1.00 1.00 252596\n", + " samples avg 1.00 1.00 1.00 252596\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Assuming y_test and y_pred are in multi-label format\n", + "mlb = MultiLabelBinarizer()\n", + "y_test_binary = mlb.fit_transform(y_test)\n", + "y_pred_binary = mlb.transform(y_pred)\n", + "\n", + "# Compute and print accuracy\n", + "accuracy = accuracy_score(y_test_binary, y_pred_binary)\n", + "print(f\"Accuracy: {accuracy}\")\n", + "\n", + "# Print classification report\n", + "print(classification_report(y_test_binary, y_pred_binary, zero_division=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 6: Dense Model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Model: \"sequential_1\"\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mModel: \"sequential_1\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n",
+       "┃ Layer (type)                          Output Shape                         Param # ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n",
+       "│ dense_2 (Dense)                      │ (None, 10)                  │       6,462,880 │\n",
+       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
+       "│ dense_3 (Dense)                      │ (None, 10)                  │             110 │\n",
+       "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n", + "│ dense_2 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m10\u001b[0m) │ \u001b[38;5;34m6,462,880\u001b[0m │\n", + "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", + "│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m10\u001b[0m) │ \u001b[38;5;34m110\u001b[0m │\n", + "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Total params: 6,462,990 (24.65 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m6,462,990\u001b[0m (24.65 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Trainable params: 6,462,990 (24.65 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m6,462,990\u001b[0m (24.65 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Non-trainable params: 0 (0.00 B)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "input_shape = X_train.shape[1]\n", + "\n", + "model_6 = tf.keras.Sequential([\n", + " tf.keras.layers.Input(shape=(input_shape,)),\n", + " tf.keras.layers.Dense(10, activation='relu'),\n", + " tf.keras.layers.Dense(y_train.shape[1], activation='softmax')\n", + "])\n", + "\n", + "model_6.compile(loss='categorical_crossentropy',\n", + " optimizer=tf.keras.optimizers.SGD(),\n", + " metrics=['accuracy'])\n", + "\n", + "model_6.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m126s\u001b[0m 8ms/step - accuracy: 0.9728 - loss: 0.1719 - val_accuracy: 0.9928 - val_loss: 0.0325\n", + "Epoch 2/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m125s\u001b[0m 8ms/step - accuracy: 0.9947 - loss: 0.0283 - val_accuracy: 0.9971 - val_loss: 0.0202\n", + "Epoch 3/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m133s\u001b[0m 8ms/step - accuracy: 0.9972 - loss: 0.0181 - val_accuracy: 0.9971 - val_loss: 0.0153\n", + "Epoch 4/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m132s\u001b[0m 8ms/step - accuracy: 0.9972 - loss: 0.0144 - val_accuracy: 0.9971 - val_loss: 0.0128\n", + "Epoch 5/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m469s\u001b[0m 30ms/step - accuracy: 0.9972 - loss: 0.0120 - val_accuracy: 0.9971 - val_loss: 0.0113\n", + "Epoch 6/10\n", + "\u001b[1m15788/15788\u001b[0m 
\u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m138s\u001b[0m 9ms/step - accuracy: 0.9974 - loss: 0.0102 - val_accuracy: 0.9973 - val_loss: 0.0102\n", + "Epoch 7/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m142s\u001b[0m 9ms/step - accuracy: 0.9975 - loss: 0.0092 - val_accuracy: 0.9975 - val_loss: 0.0094\n", + "Epoch 8/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4099s\u001b[0m 256ms/step - accuracy: 0.9976 - loss: 0.0087 - val_accuracy: 0.9977 - val_loss: 0.0088\n", + "Epoch 9/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m144s\u001b[0m 8ms/step - accuracy: 0.9980 - loss: 0.0079 - val_accuracy: 0.9979 - val_loss: 0.0083\n", + "Epoch 10/10\n", + "\u001b[1m15788/15788\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1035s\u001b[0m 65ms/step - accuracy: 0.9980 - loss: 0.0077 - val_accuracy: 0.9980 - val_loss: 0.0078\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_6.fit(X_train, y_train,\n", + " epochs=10,\n", + " validation_data=(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_preds(y_true, y_pred, threshold=0.1):\n", + " # Make sure float32 (for metric calculations)\n", + " y_true = tf.cast(y_true, dtype=tf.float32)\n", + " y_pred = tf.cast(y_pred, dtype=tf.float32)\n", + "\n", + " # Calculate various metrics\n", + " mae = tf.reduce_mean(tf.abs(y_true - y_pred))\n", + " mse = tf.reduce_mean(tf.square(y_true - y_pred))\n", + " rmse = tf.sqrt(mse)\n", + " mape = tf.reduce_mean(tf.abs((y_true - y_pred) / tf.clip_by_value(tf.abs(y_true), 1e-7, tf.reduce_max(tf.abs(y_true))))) * 100\n", + "\n", + " # Calculate accuracy\n", + " 
# Predictions are considered accurate if the absolute error is within the threshold\n", + " accurate_predictions = tf.abs(y_true - y_pred) < (threshold * tf.abs(y_true))\n", + " accuracy = tf.reduce_mean(tf.cast(accurate_predictions, dtype=tf.float32))\n", + "\n", + " return {\n", + " \"mae\": mae.numpy(),\n", + " \"mse\": mse.numpy(),\n", + " \"rmse\": rmse.numpy(),\n", + " \"mape\": mape.numpy(),\n", + " \"accuracy\": accuracy.numpy()\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m3947/3947\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 8ms/step\n" + ] + } + ], + "source": [ + "y_preds = tf.squeeze(model_6.predict(X_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mae': 0.0007503413,\n", + " 'mse': 0.0003056716,\n", + " 'rmse': 0.017483467,\n", + " 'mape': 375171.03,\n", + " 'accuracy': 0.099478215}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate_preds(y_true=y_test, y_pred=y_preds, threshold=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m3947/3947\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m27s\u001b[0m 7ms/step - accuracy: 0.9980 - loss: 0.0079\n", + "Accuracy: 99.80%\n" + ] + } + ], + "source": [ + "loss, accuracy = model_6.evaluate(X_test, y_test)\n", + "print(f'Accuracy: {accuracy * 100:.2f}%')" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ACI IoT Network Traffic Dataset Analysis/Model/README.md b/ACI IoT Network Traffic Dataset Analysis/Model/README.md new file mode 100644 index 000000000..dc909f79f --- /dev/null +++ b/ACI IoT Network Traffic Dataset Analysis/Model/README.md @@ -0,0 +1,55 @@ +## **ACI IoT Network Traffic Dataset Analysis** + +### 🎯 **Goal** + +Analyze the traffic dataset + +### 🧵 **Dataset** + +https://www.kaggle.com/datasets/emilynack/aci-iot-network-traffic-dataset-2023 + +### 🧾 **Description** + +The project aims to analyze the ACI IoT Network Traffic Dataset 2023 to identify patterns and anomalies in network traffic. The goal is to build an accurate predictive model for network anomaly detection. + +### 🧮 **What I have done** + +Load the data using appropriate tools and conduct an initial inspection to identify missing values and outliers. Perform exploratory data analysis (EDA) to understand feature distributions and relationships. Clean the data by handling missing values and outliers, and engineer new features if necessary. Split the data into training and testing sets, scaling features as needed. Build and evaluate various models. Finalize the best model, evaluate it on the test set, and prepare it for deployment. Document each step and report the findings to ensure clarity and reproducibility. +### 🚀 **Models Implemented** + +1. Random Forest Classifier +2. XGBoost +3. SVM +4. KNN +5. Decision Tree +6. Dense Model + +### 📚 **Libraries Needed** + +1. numpy +2. Pandas +3. Matplotlib +4. scikit-learn + +### 📊 **Exploratory Data Analysis Results** + + + + +### 📈 **Performance of the Models based on the Accuracy Scores** + +1. Random Forest Classifier: 99.77% +2. XGBoost: 99.79% +3. 
SVM: 99.99% +4. KNN: 99.99% +5. Decision Tree: 100% +6. Dense Model: 99.80% + + +### 📢 **Conclusion** + +The Decision Tree proved to be the best model, with an accuracy score of 100%. Note that the scores for models 1-5 were computed on one-hot labels re-encoded with `MultiLabelBinarizer`, so the reports cover only the indicator values 0 and 1 rather than the ten traffic classes; per-class metrics should be checked before relying on this ranking. + +### ✒️ **Your Signature** + +Aditi Kala diff --git a/ACI IoT Network Traffic Dataset Analysis/requirements.txt b/ACI IoT Network Traffic Dataset Analysis/requirements.txt new file mode 100644 index 000000000..8493c72c5 --- /dev/null +++ b/ACI IoT Network Traffic Dataset Analysis/requirements.txt @@ -0,0 +1,4 @@ +scikit-learn +matplotlib +numpy +pandas \ No newline at end of file