diff --git "a/TrabajoPr\303\241cticoIntegradorAA1.ipynb" "b/TrabajoPr\303\241cticoIntegradorAA1.ipynb"
index 3801ab2..7928584 100644
--- "a/TrabajoPr\303\241cticoIntegradorAA1.ipynb"
+++ "b/TrabajoPr\303\241cticoIntegradorAA1.ipynb"
@@ -6,7 +6,7 @@
"id": "WD5l3fumxWSA"
},
"source": [
- "# **Trabajo Práctico**\n",
+ "# **Trabajo Práctico Integrador**\n",
"\n",
"\n",
"\n",
@@ -42,15 +42,80 @@
"----------------------------------------------------------------------------------------------------------------------------------------------------------------"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Q1uJGdddgVHF"
+ },
+ "source": [
+ "**Comenzaremos instalando e importando las librerías necesarias.**"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PGv1U2dRvG4o",
+ "outputId": "a2808360-4d3f-4678-8da3-ee2e625f91af"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting tensorflow_addons\n",
+ " Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)\n",
+ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/611.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.9/611.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m604.2/611.8 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m611.8/611.8 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow_addons) (23.2)\n",
+ "Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)\n",
+ " Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)\n",
+ "Installing collected packages: typeguard, tensorflow_addons\n",
+ "Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Pinned to the release recorded in this cell's output so that re-runs install the same version.\n",
+ "%pip install tensorflow_addons==0.23.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TQf7a6dli0dI",
+ "outputId": "97e4de92-32e4-4461-a676-4b5184414d72"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m409.6/409.6 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m230.6/230.6 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h"
+ ]
+ }
+ ],
+ "source": [
+ "# %pip (instead of !pip) installs into the environment of the running kernel.\n",
+ "%pip install --quiet optuna"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zzDqBzYEol_h",
- "outputId": "9e40b865-30b3-4be8-8d9d-d23e9ea0ab86"
+ "outputId": "53e8967f-0980-48f0-faed-a0c99284a1ab"
},
"outputs": [
{
@@ -58,10 +123,10 @@
"name": "stdout",
"text": [
"Collecting shap\n",
- " Downloading shap-0.43.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (532 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m532.9/532.9 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ " Downloading shap-0.44.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (533 kB)\n",
+ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/533.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.6/533.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m532.5/533.5 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m533.5/533.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap) (1.23.5)\n",
- "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap) (1.11.3)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap) (1.11.4)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap) (1.2.2)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap) (1.5.3)\n",
"Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap) (4.66.1)\n",
@@ -77,7 +142,7 @@
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (3.2.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0)\n",
"Installing collected packages: slicer, shap\n",
- "Successfully installed shap-0.43.0 slicer-0.0.7\n"
+ "Successfully installed shap-0.44.0 slicer-0.0.7\n"
]
}
],
@@ -87,46 +152,98 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 4,
"metadata": {
- "id": "UsEIcRVchiLd"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GHQTWBffHt2N",
+ "outputId": "e5100992-f506-49d8-fa77-12ff4ac1cd9f"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "\n",
+ "\n",
+ "TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
+ "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
+ "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
+ "\n",
+ "For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"#Importamos las librerias necesarias\n",
- "\n",
+ "from datetime import datetime\n",
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
+ "import optuna\n",
+ "import joblib\n",
+ "import shap\n",
"\n",
- "\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from sklearn.preprocessing import MaxAbsScaler\n",
"from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, ElasticNetCV, LassoCV\n",
- "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error\n",
"from sklearn.linear_model import SGDRegressor\n",
"\n",
- "\n",
+ "from tensorflow_addons.metrics import RSquare\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.linear_model import LogisticRegression\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
"from imblearn.over_sampling import RandomOverSampler, SMOTE\n",
"from imblearn.under_sampling import RandomUnderSampler, NearMiss\n",
- "from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, auc,accuracy_score,classification_report, confusion_matrix\n",
+ "from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, auc,accuracy_score,classification_report, confusion_matrix,make_scorer, f1_score\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "import shap\n",
"from sklearn.dummy import DummyClassifier, DummyRegressor\n",
"\n",
- "\n",
+ "from sklearn.base import RegressorMixin\n",
"import tensorflow as tf\n",
- "from mpl_toolkits.mplot3d import Axes3D"
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Dense\n",
+ "from mpl_toolkits.mplot3d import Axes3D\n",
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "from tensorflow.keras.metrics import F1Score\n",
+ "\n",
+ "from sklearn.pipeline import Pipeline, make_pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, LeaveOneOut,GridSearchCV, RandomizedSearchCV\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import FunctionTransformer\n",
+ "\n",
+ "\n",
+ "\n",
+ "from keras.metrics import Precision, Recall\n",
+ "import keras.backend as K"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "---\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "Qb9vgzdpPQWT"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sJg0OdXigdQC"
+ },
+ "source": [
+ "# Cargamos nuestros datos y exploramos los mismos"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 5,
"metadata": {
"id": "lDILFm7jhiLh"
},
@@ -137,6 +254,37 @@
"df = pd.read_csv(file_path, sep=',',engine='python')"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "x4ZswXJnNBIl",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "57e57182-dcf9-4c75-c87a-065f5df48ba7"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['Unnamed: 0', 'Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',\n",
+ " 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am',\n",
+ " 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',\n",
+ " 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',\n",
+ " 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow', 'RainfallTomorrow'],\n",
+ " dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -153,14 +301,14 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {
+ "id": "dsofBr3ZhiLi",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 360
+ "height": 236
},
- "id": "dsofBr3ZhiLi",
- "outputId": "6533704e-3a53-44a4-ce5f-fcf39d03e08b"
+ "outputId": "25d7e03d-bc76-4ddd-f076-c3196b636ee8"
},
"outputs": [
{
@@ -199,7 +347,7 @@
],
"text/html": [
"\n",
- "
\n",
+ "
\n",
"
\n",
"\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Date | \n",
- " Location | \n",
- " MinTemp | \n",
- " MaxTemp | \n",
- " Rainfall | \n",
- " Evaporation | \n",
- " Sunshine | \n",
- " WindGustDir | \n",
- " WindGustSpeed | \n",
- " WindDir9am | \n",
- " ... | \n",
- " Humidity3pm | \n",
- " Pressure9am | \n",
- " Pressure3pm | \n",
- " Cloud9am | \n",
- " Cloud3pm | \n",
- " Temp9am | \n",
- " Temp3pm | \n",
- " RainToday | \n",
- " RainTomorrow | \n",
- " RainfallTomorrow | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 30167 | \n",
- " 2008-02-01 | \n",
- " Sydney | \n",
- " 19.5 | \n",
- " 22.4 | \n",
- " 15.6 | \n",
- " 6.2 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " S | \n",
- " ... | \n",
- " 84.0 | \n",
- " 1017.6 | \n",
- " 1017.4 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 20.7 | \n",
- " 20.9 | \n",
- " Yes | \n",
- " Yes | \n",
- " 6.0 | \n",
- "
\n",
- " \n",
- " 30168 | \n",
- " 2008-02-02 | \n",
- " Sydney | \n",
- " 19.5 | \n",
- " 25.6 | \n",
- " 6.0 | \n",
- " 3.4 | \n",
- " 2.7 | \n",
- " NaN | \n",
- " NaN | \n",
- " W | \n",
- " ... | \n",
- " 73.0 | \n",
- " 1017.9 | \n",
- " 1016.4 | \n",
- " 7.0 | \n",
- " 7.0 | \n",
- " 22.4 | \n",
- " 24.8 | \n",
- " Yes | \n",
- " Yes | \n",
- " 6.6 | \n",
- "
\n",
- " \n",
- " 30169 | \n",
- " 2008-02-03 | \n",
- " Sydney | \n",
- " 21.6 | \n",
- " 24.5 | \n",
- " 6.6 | \n",
- " 2.4 | \n",
- " 0.1 | \n",
- " NaN | \n",
- " NaN | \n",
- " ESE | \n",
- " ... | \n",
- " 86.0 | \n",
- " 1016.7 | \n",
- " 1015.6 | \n",
- " 7.0 | \n",
- " 8.0 | \n",
- " 23.5 | \n",
- " 23.0 | \n",
- " Yes | \n",
- " Yes | \n",
- " 18.8 | \n",
- "
\n",
- " \n",
- " 30170 | \n",
- " 2008-02-04 | \n",
- " Sydney | \n",
- " 20.2 | \n",
- " 22.8 | \n",
- " 18.8 | \n",
- " 2.2 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NNE | \n",
- " ... | \n",
- " 90.0 | \n",
- " 1014.2 | \n",
- " 1011.8 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 21.4 | \n",
- " 20.9 | \n",
- " Yes | \n",
- " Yes | \n",
- " 77.4 | \n",
- "
\n",
- " \n",
- " 30171 | \n",
- " 2008-02-05 | \n",
- " Sydney | \n",
- " 19.7 | \n",
- " 25.7 | \n",
- " 77.4 | \n",
- " NaN | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NNE | \n",
- " ... | \n",
- " 74.0 | \n",
- " 1008.3 | \n",
- " 1004.8 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 22.5 | \n",
- " 25.5 | \n",
- " Yes | \n",
- " Yes | \n",
- " 1.6 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 24 columns
\n",
- "
\n",
- "
\n",
- "
\n"
- ]
- },
- "metadata": {},
- "execution_count": 7
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "mQlH9q5Q8N_z",
- "outputId": "34d8ed8f-ede4-41af-dcde-bd35aca4279b"
+ "outputId": "d66ea139-3e38-49f8-d2b0-ad99a1e0b899"
},
"outputs": [
{
@@ -1077,7 +830,7 @@
]
},
"metadata": {},
- "execution_count": 8
+ "execution_count": 12
}
],
"source": [
@@ -1105,13 +858,13 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 13,
"metadata": {
+ "id": "nQSe-TKp6Rgp",
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "nQSe-TKp6Rgp",
- "outputId": "755a6c08-7574-47c4-f7a9-48aac68ac6f9"
+ "outputId": "f67bf11a-4dc6-4dbe-a9b5-bfdbee6f1c51"
},
"outputs": [
{
@@ -1122,7 +875,7 @@
]
},
"metadata": {},
- "execution_count": 9
+ "execution_count": 13
}
],
"source": [
@@ -1140,13 +893,13 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 14,
"metadata": {
+ "id": "IW09Nyvj7CGd",
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "IW09Nyvj7CGd",
- "outputId": "8de0e9f1-c44b-4a24-adb6-23e84fb5d9f0"
+ "outputId": "131a9f05-7162-40e3-ce60-81df052a1ccb"
},
"outputs": [
{
@@ -1157,7 +910,7 @@
]
},
"metadata": {},
- "execution_count": 10
+ "execution_count": 14
}
],
"source": [
@@ -1175,13 +928,13 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 15,
"metadata": {
+ "id": "wyQ-xvHQ2cTz",
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "wyQ-xvHQ2cTz",
- "outputId": "5e7303dd-561f-40e0-b006-915e29fc08a7"
+ "outputId": "9c10efe4-8bdd-4d33-f2f9-2a89f89a52d8"
},
"outputs": [
{
@@ -1216,7 +969,7 @@
]
},
"metadata": {},
- "execution_count": 11
+ "execution_count": 15
}
],
"source": [
@@ -1270,16 +1023,198 @@
"\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dnCDkiwTkqbo"
+ },
+ "source": [
+ "En primer lugar, hacemos una función para dividir nuestros datos en conjuntos de Entrenamiento y Test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sAsRkcdk62F_"
+ },
+ "source": [
+ "Visualizaremos si las variables a predecir tienen valores faltantes\n",
+ "\n",
+ "\n",
+ "Eliminamos los registros donde `RainTomorrow` y `RainfallTomorrow` tienen valores nulos, ya que completarlos erróneamente podría sesgar nuestro modelo."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 16,
+ "metadata": {
+ "id": "8iKAa15P6_LA"
+ },
+ "outputs": [],
+ "source": [
+ "df_nulos = df[df['RainfallTomorrow'].isna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zx3RPLvl6_PN",
+ "outputId": "19c6a4d0-1480-4291-cd7a-000867255fc3"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "787"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ],
+ "source": [
+ "df_nulos['RainfallTomorrow'].isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4fVlMdOL7D2r",
+ "outputId": "9ff80b46-49e7-4469-da29-4724abad2e39"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "787"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ],
+ "source": [
+ "df_nulos['RainTomorrow'].isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-36LifFE7GiG",
+ "outputId": "e35d4038-b726-4d19-dba7-218ea0da79ce"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(15986, 24)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OcOe0aOC7I2i"
+ },
+ "source": [
+ "Contamos con 15986 registros; por lo tanto, consideramos que eliminar los 787 en los que nuestras variables a predecir contienen valores nulos no afectará a nuestro modelo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "gewGjcXEa9OS"
+ },
+ "outputs": [],
+ "source": [
+ "def split_train_test(df, test_size, seed=42, target_cols=('RainfallTomorrow', 'RainTomorrow')):\n",
+ "    \"\"\"Split `df` into shuffled train/test subsets and drop rows with missing targets.\n",
+ "\n",
+ "    Args:\n",
+ "        df: DataFrame to split.\n",
+ "        test_size: Forwarded to `train_test_split` as `test_size`.\n",
+ "        seed: Forwarded to `train_test_split` as `random_state`.\n",
+ "        target_cols: Column name(s) that must be non-null; rows with a null in\n",
+ "            any of them are dropped from both subsets.\n",
+ "\n",
+ "    Returns:\n",
+ "        The `(train, test)` pair after dropping the incomplete rows.\n",
+ "    \"\"\"\n",
+ "    train, test = train_test_split(df, test_size=test_size, shuffle=True, random_state=seed)\n",
+ "\n",
+ "    # Accept a single column name as well as a collection of names.\n",
+ "    if isinstance(target_cols, str):\n",
+ "        target_cols = [target_cols]\n",
+ "    target_cols = list(target_cols)\n",
+ "\n",
+ "    # Rows are dropped after the split, so the resulting sizes can deviate\n",
+ "    # slightly from the requested `test_size`.\n",
+ "    train = train.dropna(subset=target_cols)\n",
+ "    test = test.dropna(subset=target_cols)\n",
+ "\n",
+ "    print(f'\\n Train shape: {train.shape}\\n')\n",
+ "    print(f'\\n {len(train)} Samples \\n')\n",
+ "    print('\\n' * 2)\n",
+ "\n",
+ "    print(f'\\n Test shape: {test.shape}\\n')\n",
+ "    print(f'\\n {len(test)} Samples \\n')\n",
+ "\n",
+ "    return train, test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "73ltOabZn-Q0",
+ "outputId": "c3e5b817-4568-43b5-a29f-30d019e3c4f3"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ " Train shape: (12162, 24)\n",
+ "\n",
+ "\n",
+ " 12162 Samples \n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " Test shape: (3037, 24)\n",
+ "\n",
+ "\n",
+ " 3037 Samples \n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train, test = split_train_test(df, test_size=0.2, seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 320
+ "height": 300
},
"id": "0Wx3O01VhiLk",
- "outputId": "0e486570-08cd-49ce-e55c-7de412faf8e1"
+ "outputId": "035b9242-3d6f-454f-c72d-3d99305a456b"
},
"outputs": [
{
@@ -1287,19 +1222,19 @@
"data": {
"text/plain": [
" MinTemp MaxTemp Rainfall Evaporation Sunshine \\\n",
- "count 15495.000000 15500.000000 15199.000000 14280.000000 14038.000000 \n",
- "mean 11.605221 21.751781 2.303744 4.866828 6.854694 \n",
- "std 5.851347 6.053679 7.385992 3.069869 3.842419 \n",
+ "count 12153.000000 12156.000000 12029.000000 10816.000000 10625.000000 \n",
+ "mean 11.570781 21.753751 2.267986 4.869915 6.916847 \n",
+ "std 5.857130 6.045081 7.355861 3.067040 3.822876 \n",
"min -8.000000 4.100000 0.000000 0.000000 0.000000 \n",
- "25% 7.900000 17.100000 0.000000 2.600000 3.700000 \n",
- "50% 11.600000 21.300000 0.000000 4.200000 7.500000 \n",
- "75% 15.800000 25.800000 0.800000 6.600000 10.000000 \n",
- "max 30.500000 46.800000 119.400000 23.800000 13.900000 \n",
+ "25% 7.900000 17.200000 0.000000 2.600000 3.800000 \n",
+ "50% 11.500000 21.300000 0.000000 4.200000 7.600000 \n",
+ "75% 15.800000 25.700000 0.800000 6.600000 10.000000 \n",
+ "max 28.800000 46.800000 119.400000 23.800000 13.900000 \n",
"\n",
" WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm \\\n",
- "count 14531.000000 15725.000000 15737.000000 15416.000000 15461.000000 \n",
- "mean 44.417315 16.613672 21.797166 68.903023 51.280577 \n",
- "std 14.958027 10.336386 9.518033 15.614788 17.721335 \n",
+ "count 11000.000000 11953.000000 11963.000000 12094.000000 12124.000000 \n",
+ "mean 44.247636 16.423994 21.711527 68.804366 51.090069 \n",
+ "std 14.939090 10.264634 9.514047 15.700963 17.702179 \n",
"min 11.000000 0.000000 0.000000 11.000000 3.000000 \n",
"25% 33.000000 9.000000 15.000000 59.000000 39.000000 \n",
"50% 43.000000 15.000000 20.000000 70.000000 51.000000 \n",
@@ -1307,28 +1242,28 @@
"max 122.000000 69.000000 76.000000 100.000000 100.000000 \n",
"\n",
" Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am \\\n",
- "count 15251.000000 15260.000000 13309.000000 13054.000000 15481.000000 \n",
- "mean 1018.305836 1016.064239 4.750545 4.746131 15.545953 \n",
- "std 7.400730 7.265386 2.728329 2.589879 5.559295 \n",
- "min 986.700000 985.500000 0.000000 0.000000 -1.300000 \n",
- "25% 1013.400000 1011.200000 2.000000 2.000000 11.600000 \n",
+ "count 11959.000000 11963.000000 10501.000000 10323.000000 12144.000000 \n",
+ "mean 1018.401346 1016.144311 4.721074 4.717136 15.536553 \n",
+ "std 7.407163 7.263761 2.730010 2.592636 5.554874 \n",
+ "min 986.700000 985.500000 0.000000 0.000000 -1.200000 \n",
+ "25% 1013.500000 1011.300000 2.000000 2.000000 11.500000 \n",
"50% 1018.500000 1016.300000 6.000000 6.000000 15.400000 \n",
- "75% 1023.300000 1021.000000 7.000000 7.000000 19.500000 \n",
- "max 1040.600000 1037.900000 9.000000 8.000000 37.200000 \n",
+ "75% 1023.300000 1021.100000 7.000000 7.000000 19.500000 \n",
+ "max 1040.300000 1037.800000 9.000000 8.000000 37.200000 \n",
"\n",
" Temp3pm RainfallTomorrow \n",
- "count 15490.000000 15199.000000 \n",
- "mean 20.229154 2.302586 \n",
- "std 5.857956 7.385251 \n",
+ "count 12148.000000 12162.000000 \n",
+ "mean 20.237521 2.240585 \n",
+ "std 5.849409 7.209555 \n",
"min 3.700000 0.000000 \n",
- "25% 15.900000 0.000000 \n",
+ "25% 16.000000 0.000000 \n",
"50% 19.800000 0.000000 \n",
"75% 24.100000 0.800000 \n",
- "max 46.100000 119.400000 "
+ "max 46.100000 109.400000 "
],
"text/html": [
"\n",
- "
\n",
+ "
\n",
"
\n",
"\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Date | \n",
+ " Location | \n",
+ " MinTemp | \n",
+ " MaxTemp | \n",
+ " Rainfall | \n",
+ " Evaporation | \n",
+ " Sunshine | \n",
+ " WindGustDir | \n",
+ " WindGustSpeed | \n",
+ " WindDir9am | \n",
+ " ... | \n",
+ " Pressure9am | \n",
+ " Pressure3pm | \n",
+ " Cloud9am | \n",
+ " Cloud3pm | \n",
+ " Temp9am | \n",
+ " Temp3pm | \n",
+ " RainToday | \n",
+ " RainTomorrow | \n",
+ " RainfallTomorrow | \n",
+ " season | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 35279 | \n",
+ " 2014-02-02 | \n",
+ " SydneyAirport | \n",
+ " 22.1 | \n",
+ " 29.7 | \n",
+ " 0.0 | \n",
+ " 7.2 | \n",
+ " 12.7 | \n",
+ " NNE | \n",
+ " 52.0 | \n",
+ " NNE | \n",
+ " ... | \n",
+ " 1015.6 | \n",
+ " 1014.5 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 25.4 | \n",
+ " 28.6 | \n",
+ " No | \n",
+ " No | \n",
+ " 0.0 | \n",
+ " Verano | \n",
+ "
\n",
+ " \n",
+ " 31088 | \n",
+ " 2010-08-10 | \n",
+ " Sydney | \n",
+ " 9.3 | \n",
+ " 16.4 | \n",
+ " 2.8 | \n",
+ " 2.4 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " WSW | \n",
+ " ... | \n",
+ " 1019.2 | \n",
+ " 1013.8 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 12.2 | \n",
+ " 16.2 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " 4.6 | \n",
+ " Invierno | \n",
+ "
\n",
+ " \n",
+ " 31349 | \n",
+ " 2011-05-28 | \n",
+ " Sydney | \n",
+ " 8.5 | \n",
+ " 17.4 | \n",
+ " 0.0 | \n",
+ " 3.2 | \n",
+ " 3.1 | \n",
+ " SSE | \n",
+ " 37.0 | \n",
+ " W | \n",
+ " ... | \n",
+ " 1023.8 | \n",
+ " 1022.5 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 10.4 | \n",
+ " 16.8 | \n",
+ " No | \n",
+ " No | \n",
+ " 0.0 | \n",
+ " Otoño | \n",
+ "
\n",
+ " \n",
+ " 34076 | \n",
+ " 2010-07-21 | \n",
+ " SydneyAirport | \n",
+ " 8.0 | \n",
+ " 14.6 | \n",
+ " 8.4 | \n",
+ " 3.2 | \n",
+ " 4.1 | \n",
+ " NNE | \n",
+ " 33.0 | \n",
+ " NW | \n",
+ " ... | \n",
+ " 1026.7 | \n",
+ " 1024.7 | \n",
+ " 7.0 | \n",
+ " 5.0 | \n",
+ " 9.6 | \n",
+ " 14.4 | \n",
+ " Yes | \n",
+ " No | \n",
+ " 0.0 | \n",
+ " Invierno | \n",
+ "
\n",
+ " \n",
+ " 66023 | \n",
+ " 2014-04-26 | \n",
+ " MelbourneAirport | \n",
+ " 10.4 | \n",
+ " 19.0 | \n",
+ " 0.0 | \n",
+ " 5.4 | \n",
+ " 2.5 | \n",
+ " SW | \n",
+ " 54.0 | \n",
+ " NW | \n",
+ " ... | \n",
+ " 1012.0 | \n",
+ " 1015.1 | \n",
+ " 6.0 | \n",
+ " 7.0 | \n",
+ " 18.2 | \n",
+ " 16.0 | \n",
+ " No | \n",
+ " No | \n",
+ " 0.2 | \n",
+ " Otoño | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 30
+ }
+ ],
+ "source": [
+ "probando_train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "E75d3urwrDON"
+ },
+ "source": [
+ "# Gráficos"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "id": "Xh1BOe6lcqtI"
+ },
+ "outputs": [],
+ "source": [
+ "#Creamos otro df eliminandole los nulos para poder graficar\n",
+ "df_clean = train.dropna()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vcHATpBxW5YW"
+ },
+ "source": [
+ "Hacemos una función para crear `Boxplots` y otra para crear `Histogramas`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "id": "Rn9_M3wmXFoO"
+ },
+ "outputs": [],
+ "source": [
+ "def plot_boxplots(df, columns):\n",
+ "    \"\"\"Draw one boxplot per entry of `columns`, side by side in a single figure.\n",
+ "\n",
+ "    Args:\n",
+ "        df: DataFrame holding the data.\n",
+ "        columns: Names of the columns of `df` to plot.\n",
+ "    \"\"\"\n",
+ "    # squeeze=False keeps `axes` two-dimensional, so a single column works too\n",
+ "    # (otherwise plt.subplots returns a bare Axes that cannot be indexed).\n",
+ "    fig, axes = plt.subplots(1, len(columns), figsize=(12, 6), squeeze=False)\n",
+ "\n",
+ "    for ax, column in zip(axes[0], columns):\n",
+ "        # Missing values are dropped so they cannot blank out the box.\n",
+ "        ax.boxplot(df[column].dropna())\n",
+ "        ax.set_title(column)\n",
+ "        ax.set_xticks([1])\n",
+ "        ax.set_xticklabels([column])\n",
+ "\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "id": "WLEKitleW-E0"
+ },
+ "outputs": [],
+ "source": [
+ "def plot_histograms(df, columns):\n",
+ "    \"\"\"Draw one 20-bin histogram per entry of `columns`, side by side in a single figure.\n",
+ "\n",
+ "    Args:\n",
+ "        df: DataFrame holding the data.\n",
+ "        columns: Names of the columns of `df` to plot.\n",
+ "    \"\"\"\n",
+ "    # squeeze=False keeps `axes` two-dimensional, so a single column works too\n",
+ "    # (otherwise plt.subplots returns a bare Axes that cannot be indexed).\n",
+ "    fig, axes = plt.subplots(1, len(columns), figsize=(12, 6), squeeze=False)\n",
+ "\n",
+ "    for ax, column in zip(axes[0], columns):\n",
+ "        ax.hist(df[column], bins=20)\n",
+ "        ax.set_title(column)\n",
+ "        ax.set_xlabel('Valor')\n",
+ "        ax.set_ylabel('Frecuencia')\n",
+ "\n",
+ "    plt.tight_layout()\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 545
+ },
+ "id": "sdp_LSqjp-3q",
+ "outputId": "5a50103e-cdd3-42c6-8a58-bab0eb67547e"
+ },
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "