ironhack-labs · saul-vv · Oct 13, 2024
diff --git a/your-code/main.ipynb b/your-code/main.ipynb
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "00cf591d-8a5b-499e-8715-1ad140867934",
    "metadata": {},
    "outputs": [],
@@ -28,10 +28,158 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "0bb5ea1c-a4e5-4419-bae8-661fe2d82711",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>age</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>cp</th>\n",
+       "      <th>trestbps</th>\n",
+       "      <th>chol</th>\n",
+       "      <th>fbs</th>\n",
+       "      <th>restecg</th>\n",
+       "      <th>thalach</th>\n",
+       "      <th>exang</th>\n",
+       "      <th>oldpeak</th>\n",
+       "      <th>slope</th>\n",
+       "      <th>ca</th>\n",
+       "      <th>thal</th>\n",
+       "      <th>target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>63</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>145</td>\n",
+       "      <td>233</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>150</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2.3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>37</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>250</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>187</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>130</td>\n",
+       "      <td>204</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>172</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>56</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>120</td>\n",
+       "      <td>236</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>178</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.8</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>57</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>120</td>\n",
+       "      <td>354</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>163</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.6</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \\\n",
+       "0   63    1   3       145   233    1        0      150      0      2.3      0   \n",
+       "1   37    1   2       130   250    0        1      187      0      3.5      0   \n",
+       "2   41    0   1       130   204    0        0      172      0      1.4      2   \n",
+       "3   56    1   1       120   236    0        1      178      0      0.8      2   \n",
+       "4   57    0   0       120   354    0        1      163      1      0.6      2   \n",
+       "\n",
+       "   ca  thal  target  \n",
+       "0   0     1       1  \n",
+       "1   0     2       1  \n",
+       "2   0     2       1  \n",
+       "3   0     2       1  \n",
+       "4   0     2       1  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "df.head()"
    ]
@@ -41,12 +189,12 @@
    "id": "870ebc45-d873-4c37-b1e4-ce2b0ebc08f2",
    "metadata": {},
    "source": [
-    "We are going to try to predict the presence of hart disease suing this features, starting with a classical baseline method and trying to improve on that result with a series of ensembled approaches."
+    "We are going to try to predict the presence of heart disease using these features, starting with a classical baseline method and trying to improve on that result with a series of ensemble approaches."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "23ad7e40-87f3-4b93-bef9-a9ddb5881ddc",
    "metadata": {},
    "outputs": [],
@@ -75,23 +223,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "d39376f1-b4ca-44c0-8364-d11b9a7605f9",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy of the train set: 1.0\n",
+      "Accuracy of the test set: 0.7894736842105263\n"
+     ]
+    }
+   ],
    "source": [
     "#Create and Train a Decision Tree Classifier and print the train and test accuracy\n",
     "\n",
     "from sklearn.tree import DecisionTreeClassifier\n",
     "from sklearn.metrics import accuracy_score, mean_squared_error\n",
     "\n",
     "# Train Decision Tree\n",
-    "\n",
+    "decision_tree_model = DecisionTreeClassifier()\n",
+    "decision_tree_model.fit(X_train_scaled, y_train)\n",
     "\n",
     "# Predictions and evaluation\n",
+    "y_pred_train = decision_tree_model.predict(X_train_scaled)\n",
+    "acc_train = accuracy_score(y_train, y_pred_train)\n",
+    "print(\"Accuracy of the train set:\", acc_train)\n",
     "\n",
+    "y_pred_test = decision_tree_model.predict(X_test_scaled)\n",
+    "acc_test = accuracy_score(y_test, y_pred_test)\n",
+    "print(\"Accuracy of the test set:\", acc_test)\n",
     "\n",
-    "# Evaluate performance\n"
+    "# Evaluate performance\n",
+    "# Training accuracy is perfect (1.0) while test accuracy is much lower (~0.79): the unconstrained tree is clearly overfitting."
    ]
   },
   {
@@ -135,10 +300,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "8fc76766-a90c-47ed-bd02-66827a1dc115",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy of the train set: 0.986784140969163\n",
+      "Accuracy of the test set: 0.8421052631578947\n"
+     ]
+    }
+   ],
    "source": [
     "# Create and Train a BaggingClassifier. \n",
     "# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really over a lot of different data samples\n",
@@ -147,12 +321,20 @@
     "from sklearn.ensemble import BaggingClassifier\n",
     "\n",
     "# Train BaggingClassifier\n",
-    "\n",
+    "bagging_model = BaggingClassifier()\n",
+    "bagging_model.fit(X_train_scaled, y_train)\n",
     "\n",
     "# Predictions and evaluation\n",
+    "y_pred_train = bagging_model.predict(X_train_scaled)\n",
+    "acc_train = accuracy_score(y_train, y_pred_train)\n",
+    "print(\"Accuracy of the train set:\", acc_train)\n",
     "\n",
+    "y_pred_test = bagging_model.predict(X_test_scaled)\n",
+    "acc_test = accuracy_score(y_test, y_pred_test)\n",
+    "print(\"Accuracy of the test set:\", acc_test)\n",
     "\n",
-    "# Evaluate performance\n"
+    "# Evaluate performance\n",
+    "# It's still overfitting\n"
    ]
   },
   {
@@ -192,10 +374,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
    "id": "4bba1773-b0b0-44ba-a838-58b8c466ff88",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy of the train set: 0.9383259911894273\n",
+      "Accuracy of the test set: 0.8421052631578947\n"
+     ]
+    }
+   ],
    "source": [
     "# Create and Train a AdaBoostClassifier. \n",
     "# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really target the specific behaviors of this phenomenon\n",
@@ -204,12 +395,20 @@
     "from sklearn.ensemble import AdaBoostClassifier\n",
     "\n",
     "# Train AdaBoost\n",
-    "\n",
+    "boosting_model = AdaBoostClassifier()\n",
+    "boosting_model.fit(X_train_scaled, y_train)\n",
     "\n",
     "# Predictions and evaluation\n",
+    "y_pred_train = boosting_model.predict(X_train_scaled)\n",
+    "acc_train = accuracy_score(y_train, y_pred_train)\n",
+    "print(\"Accuracy of the train set:\", acc_train)\n",
     "\n",
+    "y_pred_test = boosting_model.predict(X_test_scaled)\n",
+    "acc_test = accuracy_score(y_test, y_pred_test)\n",
+    "print(\"Accuracy of the test set:\", acc_test)\n",
     "\n",
-    "# Evaluate performance\n"
+    "# Evaluate performance\n",
+    "# While training accuracy is decreasing, testing accuracy has improved. This shows less overfitting (although it's still high)"
    ]
   },
   {
@@ -235,7 +434,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -249,7 +448,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,