Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

241008 Lab Ensembles #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 218 additions & 19 deletions your-code/main.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "00cf591d-8a5b-499e-8715-1ad140867934",
"metadata": {},
"outputs": [],
Expand All @@ -28,10 +28,158 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "0bb5ea1c-a4e5-4419-bae8-661fe2d82711",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>cp</th>\n",
" <th>trestbps</th>\n",
" <th>chol</th>\n",
" <th>fbs</th>\n",
" <th>restecg</th>\n",
" <th>thalach</th>\n",
" <th>exang</th>\n",
" <th>oldpeak</th>\n",
" <th>slope</th>\n",
" <th>ca</th>\n",
" <th>thal</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>63</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>145</td>\n",
" <td>233</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>150</td>\n",
" <td>0</td>\n",
" <td>2.3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>130</td>\n",
" <td>250</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>187</td>\n",
" <td>0</td>\n",
" <td>3.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>41</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>130</td>\n",
" <td>204</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>172</td>\n",
" <td>0</td>\n",
" <td>1.4</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>120</td>\n",
" <td>236</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>178</td>\n",
" <td>0</td>\n",
" <td>0.8</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>57</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>120</td>\n",
" <td>354</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>163</td>\n",
" <td>1</td>\n",
" <td>0.6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n",
"0 63 1 3 145 233 1 0 150 0 2.3 0 \n",
"1 37 1 2 130 250 0 1 187 0 3.5 0 \n",
"2 41 0 1 130 204 0 0 172 0 1.4 2 \n",
"3 56 1 1 120 236 0 1 178 0 0.8 2 \n",
"4 57 0 0 120 354 0 1 163 1 0.6 2 \n",
"\n",
" ca thal target \n",
"0 0 1 1 \n",
"1 0 2 1 \n",
"2 0 2 1 \n",
"3 0 2 1 \n",
"4 0 2 1 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
Expand All @@ -41,12 +189,12 @@
"id": "870ebc45-d873-4c37-b1e4-ce2b0ebc08f2",
"metadata": {},
"source": [
"We are going to try to predict the presence of hart disease suing this features, starting with a classical baseline method and trying to improve on that result with a series of ensembled approaches."
"We are going to try to predict the presence of heart disease using this features, starting with a classical baseline method and trying to improve on that result with a series of ensembled approaches."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "23ad7e40-87f3-4b93-bef9-a9ddb5881ddc",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -75,23 +223,40 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "d39376f1-b4ca-44c0-8364-d11b9a7605f9",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of the train set: 1.0\n",
"Accuracy of the test set: 0.7894736842105263\n"
]
}
],
"source": [
"#Create and Train a Decision Tree Classifier and print the train and test accuracy\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.metrics import accuracy_score, mean_squared_error\n",
"\n",
"# Train Decision Tree\n",
"\n",
"# Baseline: a single, unconstrained decision tree.\n",
"# random_state is fixed so that ties between equally good splits are broken the same way\n",
"# on every Restart & Run All. Re-run this cell so the stored output matches the code.\n",
"decision_tree_model = DecisionTreeClassifier(random_state=42)\n",
"decision_tree_model.fit(X_train_scaled, y_train)\n",
"\n",
"# Predictions and evaluation\n",
"# Accuracy on the data the tree was fitted on\n",
"y_pred_train = decision_tree_model.predict(X_train_scaled)\n",
"acc_train = accuracy_score(y_train, y_pred_train)\n",
"print(\"Accuracy of the train set:\", acc_train)\n",
"\n",
"# Accuracy on the test split; compare with the train accuracy to judge overfitting\n",
"y_pred_test = decision_tree_model.predict(X_test_scaled)\n",
"acc_test = accuracy_score(y_test, y_pred_test)\n",
"print(\"Accuracy of the test set:\", acc_test)\n",
"\n",
"# Evaluate performance\n"
"# Evaluate performance\n",
"# The accuracy of the train score is perfect, while the accuracy of the test score is much lower. This shows a clear overfitting model"
]
},
{
Expand Down Expand Up @@ -135,10 +300,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "8fc76766-a90c-47ed-bd02-66827a1dc115",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of the train set: 0.986784140969163\n",
"Accuracy of the test set: 0.8421052631578947\n"
]
}
],
"source": [
"# Create and Train a BaggingClassifier. \n",
"# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really over a lot of different data samples\n",
Expand All @@ -147,12 +321,20 @@
"from sklearn.ensemble import BaggingClassifier\n",
"\n",
"# Train BaggingClassifier\n",
"\n",
"# Follow the exercise spec: bag 100 weak learners, each a depth-1 decision tree (stump).\n",
"# The defaults (a fully grown tree and only 10 estimators) are not what is asked for.\n",
"# The base estimator is passed positionally because its keyword was renamed\n",
"# (base_estimator -> estimator) in scikit-learn 1.2; positional works on both.\n",
"# DecisionTreeClassifier is imported in the Decision Tree cell above.\n",
"# random_state fixes the bootstrap samples so the result is reproducible.\n",
"# Re-run this cell so the stored output and the remark below reflect this configuration.\n",
"bagging_model = BaggingClassifier(\n",
"    DecisionTreeClassifier(max_depth=1),\n",
"    n_estimators=100,\n",
"    random_state=42,\n",
")\n",
"bagging_model.fit(X_train_scaled, y_train)\n",
"\n",
"# Predictions and evaluation\n",
"# Accuracy on the data the ensemble was fitted on\n",
"y_pred_train = bagging_model.predict(X_train_scaled)\n",
"acc_train = accuracy_score(y_train, y_pred_train)\n",
"print(\"Accuracy of the train set:\", acc_train)\n",
"\n",
"# Accuracy on the test split; compare with the train accuracy to judge overfitting\n",
"y_pred_test = bagging_model.predict(X_test_scaled)\n",
"acc_test = accuracy_score(y_test, y_pred_test)\n",
"print(\"Accuracy of the test set:\", acc_test)\n",
"\n",
"# Evaluate performance\n"
"# Evaluate performance\n",
"# It's still overfitting\n"
]
},
{
Expand Down Expand Up @@ -192,10 +374,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"id": "4bba1773-b0b0-44ba-a838-58b8c466ff88",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of the train set: 0.9383259911894273\n",
"Accuracy of the test set: 0.8421052631578947\n"
]
}
],
"source": [
"# Create and Train a AdaBoostClassifier. \n",
"# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really target the specific behaviors of this phenomenon\n",
Expand All @@ -204,12 +395,20 @@
"from sklearn.ensemble import AdaBoostClassifier\n",
"\n",
"# Train AdaBoost\n",
"\n",
"# Follow the exercise spec: boost 100 weak learners, each a depth-1 decision tree (stump).\n",
"# AdaBoost's default base estimator is already a depth-1 tree, but it only builds 50 of them;\n",
"# both settings are spelled out here so the cell matches the instructions above.\n",
"# The base estimator is passed positionally because its keyword was renamed\n",
"# (base_estimator -> estimator) in scikit-learn 1.2; positional works on both.\n",
"# DecisionTreeClassifier is imported in the Decision Tree cell above.\n",
"# random_state makes the run reproducible.\n",
"# Re-run this cell so the stored output and the remark below reflect this configuration.\n",
"boosting_model = AdaBoostClassifier(\n",
"    DecisionTreeClassifier(max_depth=1),\n",
"    n_estimators=100,\n",
"    random_state=42,\n",
")\n",
"boosting_model.fit(X_train_scaled, y_train)\n",
"\n",
"# Predictions and evaluation\n",
"# Accuracy on the data the ensemble was fitted on\n",
"y_pred_train = boosting_model.predict(X_train_scaled)\n",
"acc_train = accuracy_score(y_train, y_pred_train)\n",
"print(\"Accuracy of the train set:\", acc_train)\n",
"\n",
"# Accuracy on the test split; compare with the train accuracy to judge overfitting\n",
"y_pred_test = boosting_model.predict(X_test_scaled)\n",
"acc_test = accuracy_score(y_test, y_pred_test)\n",
"print(\"Accuracy of the test set:\", acc_test)\n",
"\n",
"# Evaluate performance\n"
"# Evaluate performance\n",
"# While training accuracy is decreasing, testing accuracy has improved. This shows less overfitting (although it's still high)"
]
},
{
Expand All @@ -235,7 +434,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -249,7 +448,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.11.7"
}
},
"nbformat": 4,
Expand Down