diff --git a/lessons/pydata/homework_revisited/index_na_hodinu.ipynb b/lessons/pydata/homework_revisited/index_na_hodinu.ipynb index 8f34a53..2b829a8 100644 --- a/lessons/pydata/homework_revisited/index_na_hodinu.ipynb +++ b/lessons/pydata/homework_revisited/index_na_hodinu.ipynb @@ -19,6 +19,24 @@ "np.random.seed(42)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "from sklearn.exceptions import ConvergenceWarning\n", + "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ÚKOL 1**: Načtěte data, vyhoďte přebytečné sloupce, vyberte vstupy a výstupy a připravte rozdělení na trénovací a testovací množinu.\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -33,7 +51,7 @@ "outputs": [], "source": [ "fish_data = pd.read_csv(\"fish_data.csv\", index_col=0)\n", - "# fish_data = fish_data.drop(columns=[____])\n", + "# fish_data = fish_data.drop(columns=[___])\n", "fish_data" ] }, @@ -73,8 +91,14 @@ "source": [ "from sklearn.model_selection import train_test_split \n", "\n", - "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y)\n", - "# X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=X[\"Species\"])" + "X_train_raw, X_test_raw, y_train, y_test = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ÚKOL 2**: Překódujte kategorické proměnné a přeškálujte všechny sloupce." ] }, { @@ -91,22 +115,19 @@ "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", "\n", - "categorical_columns = [____] \n", + "categorical_columns = ___ \n", "\n", - "encoder = OneHotEncoder()\n", - "encoder.fit(X_train_raw[categorical_columns])\n", - "column_names = encoder.get_feature_names_out()\n", - " \n", - "def transform_species(X_raw):\n", - " X_res = X_raw.drop(columns=[\"Species\"])\n", - " X_res = X_res.reindex(columns=list(X_res.columns)+list(column_names))\n", - " X_res[list(column_names)] = encoder.transform(X_raw[categorical_columns]).toarray() \n", - " return X_res\n", + "transformer = make_column_transformer(\n", + " (_______, _____),\n", + " remainder=\"passthrough\"\n", + ")\n", "\n", - "X_train_onehot = transform_species(X_train_raw)\n", - "X_test_onehot = transform_species(X_test_raw)\n", - "X_train_onehot" + "X_train_onehot = transformer._______(X_train_raw)\n", + "X_test_onehot = transformer.________(X_test_raw)\n", + "\n", + "pd.DataFrame(X_train_onehot, columns=transformer.get_feature_names_out())" ] }, { @@ -124,11 +145,11 @@ "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", - "scaler = StandardScaler()\n", - "scaler.____(X_train_onehot)\n", + "scaler = ______\n", + "scaler._____(_____)\n", "\n", - "X_train = scaler.____(X_train_onehot)\n", - "X_test = scaler.____(X_test_onehot)" + "X_train = scaler.transform(X_train_onehot)\n", + "X_test = scaler.transform(X_test_onehot)" ] }, { @@ -151,6 +172,13 @@ " * C, float, optional (default=1.0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -207,6 +235,13 @@ "``` " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ÚKOL 3**: Dopište funkci `fit_and_eval` dle instrukcí." + ] + }, { "cell_type": "code", "execution_count": null, @@ -218,11 +253,13 @@ "def fit_and_eval(X_train, y_train, X_test, y_test, model, name):\n", " \"\"\" 1. Natrénuje model na trénovací množině.\n", " 2. Spočte hodnoty metrik na trénovací i testovací množině.\n", - " vrátí slovník ve tvaru {\"název metriky train\": hodnota , \"název metriky test\": hodnota} \n", + " vrátí slovník ve tvaru {\"název metriky\": hodnota} \n", " \"\"\" \n", - " ...\n", + " # zde dopiš kód\n", + " ... \n", " return {\n", - " .... \n", + " \"MSE_test\": ____,\n", + " \"MSE_train\": ____\n", " }" ] }, @@ -296,8 +333,8 @@ "y_real_test = test_data.pop(\"Weight\")\n", "X_real_test = test_data \n", "\n", - "X_real_test = transform_species(X_real_test)\n", - "X_real_test_scaled = scaler.transform(X_real_test)" + "X_real_test_transformed = _____\n", + "X_real_test_scaled = _______" ] }, { @@ -309,7 +346,8 @@ "y_pred_test = best_model.predict(X_real_test_scaled)\n", "\n", "print(f\"MAE {mean_absolute_error(y_real_test, y_pred_test):.3f}\")\n", - "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")" + "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")\n", + "print(f\"R2 {r2_score(y_real_test, y_pred_test):.3f}\")" ] }, { @@ -342,18 +380,32 @@ "metadata": {}, "outputs": [], "source": [ - "is_bream = X_real_test[\"Species_Bream\"] == 1 \n", - "bream = X_real_test[is_bream][\"Length3\"]\n", - "\n", - "bream_weights = y_real_test[is_bream]\n", + "is_bream = X_real_test[\"Species\"] == \"Bream\"\n", "predicted_bream_weights = best_model.predict(X_real_test_scaled[is_bream])\n", "\n", - "is_roach = X_real_test[\"Species_Roach\"] == 1\n", - "roach = X_real_test[is_roach][\"Length3\"]\n", - "roach_weights = y_real_test[is_roach]\n", + "is_roach = X_real_test[\"Species\"] == \"Roach\"\n", "predicted_roach_weights = best_model.predict(X_real_test_scaled[is_roach])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_bream = pd.DataFrame()\n", + "result_bream[\"length\"] = X_real_test[is_bream][\"Length3\"]\n", + "result_bream[\"weight\"] = y_real_test[is_bream]\n", + "result_bream[\"predicted\"] = predicted_bream_weights\n", + "result_bream = result_bream.sort_values(\"length\")\n", + "\n", + "result_roach = pd.DataFrame()\n", + "result_roach[\"length\"] = X_real_test[is_roach][\"Length3\"]\n", + "result_roach[\"weight\"] = y_real_test[is_roach]\n", + "result_roach[\"predicted\"] = predicted_roach_weights\n", + "result_roach = result_roach.sort_values(\"length\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -361,17 +413,16 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt \n", - "%matplotlib inline\n", "\n", "fig, ax = plt.subplots(1, 2)\n", "\n", - "ax[0].scatter(bream, bream_weights, label=\"true weight\");\n", - "ax[0].scatter(bream, predicted_bream_weights, label=\"prediction\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[0].legend()\n", "ax[0].set_title(\"Bream\")\n", "\n", - "ax[1].scatter(roach, roach_weights, label=\"true weight\");\n", - "ax[1].scatter(roach, predicted_roach_weights, label=\"prediction\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[1].legend()\n", "ax[1].set_title(\"Roach\");" ] @@ -393,9 +444,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }