🧐 Last exploration of the pklm test before implementation in Qolmat
adriencrtrcap committed Aug 1, 2024
1 parent c87d08d commit e13f359
Showing 15 changed files with 724,506 additions and 229 deletions.
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "9f9c3954",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8a57d1c9",
"metadata": {},
"outputs": [],
"source": [
"def create_df_with_nan(n_rows: int, n_cols: int) -> pd.DataFrame:\n",
" data = {f\"Colonne_{i}\": np.random.normal(size=n_rows).astype(float) for i in range(n_cols)}\n",
" df = pd.DataFrame(data)\n",
" nb_valeurs_manquantes = int(0.35 * df.size)\n",
" indices_valeurs_manquantes = np.random.choice(df.size, nb_valeurs_manquantes, replace=False)\n",
" df.values.flat[indices_valeurs_manquantes] = np.nan\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1aa325a7",
"metadata": {},
"outputs": [],
"source": [
"create_df_with_nan(500, 10).to_csv(\"df_for_bug.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "2fb7eb43",
"metadata": {},
"outputs": [],
"source": [
"config = {\n",
" \"df1\": {\"n_rows\": 200, \"n_cols\": 2},\n",
" \"df2\": {\"n_rows\": 500, \"n_cols\": 2},\n",
" \"df3\": {\"n_rows\": 500, \"n_cols\": 4},\n",
" \"df4\": {\"n_rows\": 1000, \"n_cols\": 4},\n",
" \"df5\": {\"n_rows\": 1000, \"n_cols\": 6},\n",
" \"df6\": {\"n_rows\": 10000, \"n_cols\": 6},\n",
" \"df7\": {\"n_rows\": 10000, \"n_cols\": 10},\n",
" \"df8\": {\"n_rows\": 100000, \"n_cols\": 10},\n",
" \"df9\": {\"n_rows\": 100000, \"n_cols\": 15}\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e542d35c",
"metadata": {},
"outputs": [],
"source": [
"for df_name, params in config.items():\n",
" df = create_df_with_nan(**params)\n",
" df.to_csv(f\"{df_name}.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
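For reference, a copy-safe sketch of the same generator (create_df_with_nan_masked is an illustrative name, not part of the notebook). It blanks the same number of cells through DataFrame.mask instead of writing into df.values.flat, a pattern that is fragile under pandas copy-on-write.

import numpy as np
import pandas as pd

def create_df_with_nan_masked(n_rows: int, n_cols: int) -> pd.DataFrame:
    # Same layout as the notebook's generator: standard-normal columns Colonne_0..Colonne_{n_cols-1}.
    data = {f"Colonne_{i}": np.random.normal(size=n_rows).astype(float) for i in range(n_cols)}
    df = pd.DataFrame(data)
    # Pick exactly 35% of the cells, then blank them in one vectorized call.
    n_missing = int(0.35 * df.size)
    mask = np.zeros(df.shape, dtype=bool)
    mask.flat[np.random.choice(df.size, n_missing, replace=False)] = True
    return df.mask(mask)

# Quick check: overall NaN rate should be close to 0.35.
print(create_df_with_nan_masked(500, 10).isna().mean().mean())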
186 changes: 186 additions & 0 deletions examples/pklm/Optimisation/Paralelisation.ipynb
@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"id": "d1d98837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 9.613310098648071 seconds ---\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"\n",
"_, n_col = df.shape\n",
"\n",
"result = 0\n",
"\n",
"start_time = time.time()\n",
"\n",
"for _ in range (100):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42\n",
" )\n",
" clf.fit(X, y)\n",
" \n",
" result += clf.score(X, y)\n",
" \n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "91583a21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 26.025609016418457 seconds ---\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"\n",
"_, n_col = df.shape\n",
"\n",
"result = 0\n",
"\n",
"start_time = time.time()\n",
"\n",
"for _ in range (100):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42,\n",
" n_jobs=-1\n",
" )\n",
" clf.fit(X, y)\n",
" \n",
" result += clf.score(X, y)\n",
" \n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c651c52b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 1.8062052726745605 seconds ---\n",
"Result: 100.0\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from joblib import Parallel, delayed\n",
"\n",
"# Génération de données\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"_, n_col = df.shape\n",
"\n",
"# Fonction pour effectuer une itération du processus\n",
"def process_iteration(i):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42\n",
" )\n",
" clf.fit(X, y)\n",
" return clf.score(X, y)\n",
"\n",
"start_time = time.time()\n",
"\n",
"# Utilisation de joblib pour paralléliser les itérations\n",
"results = Parallel(n_jobs=-1)(delayed(process_iteration)(i) for i in range(100))\n",
"\n",
"# Calcul du résultat final\n",
"result = sum(results)\n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))\n",
"print(f\"Result: {result}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
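The three cells above compare a sequential loop (about 9.6 s), n_jobs=-1 inside each forest (about 26 s) and joblib parallelism over the iterations (about 1.8 s). The sketch below is a compact way to rerun that comparison; one_projection and run_benchmark are illustrative names, not part of the notebook, and n_jobs=1 reproduces the sequential cell while n_jobs=-1 reproduces the joblib cell.

import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier


def one_projection(df: pd.DataFrame, n_col: int) -> float:
    # One iteration of the loops above: draw a random column projection,
    # recode its first column as the "target", fit a random forest and score it.
    df_classif = df.sample(np.random.randint(2, n_col), axis=1)
    df_classif = df_classif.rename(columns={df_classif.columns[0]: "target"})
    df_classif["target"] = 1 * df_classif["target"].isnull()
    X, y = df_classif.drop(columns="target"), df_classif["target"]
    clf = RandomForestClassifier(
        n_estimators=200,
        max_features=None,
        min_samples_split=10,
        bootstrap=True,
        oob_score=True,
        random_state=42,
    )
    clf.fit(X, y)
    return clf.score(X, y)


def run_benchmark(df: pd.DataFrame, n_iter: int = 100, n_jobs: int = 1) -> float:
    # n_jobs=1 runs the iterations sequentially; n_jobs=-1 parallelizes them with joblib.
    _, n_col = df.shape
    start_time = time.time()
    scores = Parallel(n_jobs=n_jobs)(
        delayed(one_projection)(df, n_col) for _ in range(n_iter)
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    return sum(scores)


# Usage: same synthetic data as above, parallelized over the 100 iterations.
df = pd.DataFrame({f"Colonne_{i}": np.random.normal(size=200) for i in range(6)})
print("Result:", run_benchmark(df, n_jobs=-1))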