🧐 Last exploration of the pklm test before implementation in Qolmat
adriencrtrcap committed Aug 1, 2024
1 parent c87d08d commit e13f359
Showing 15 changed files with 724,506 additions and 229 deletions.
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "9f9c3954",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8a57d1c9",
"metadata": {},
"outputs": [],
"source": [
"def create_df_with_nan(n_rows: int, n_cols: int) -> pd.DataFrame:\n",
" data = {f\"Colonne_{i}\": np.random.normal(size=n_rows).astype(float) for i in range(n_cols)}\n",
" df = pd.DataFrame(data)\n",
" nb_valeurs_manquantes = int(0.35 * df.size)\n",
" indices_valeurs_manquantes = np.random.choice(df.size, nb_valeurs_manquantes, replace=False)\n",
" df.values.flat[indices_valeurs_manquantes] = np.nan\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1aa325a7",
"metadata": {},
"outputs": [],
"source": [
"create_df_with_nan(500, 10).to_csv(\"df_for_bug.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "2fb7eb43",
"metadata": {},
"outputs": [],
"source": [
"config = {\n",
" \"df1\": {\"n_rows\": 200, \"n_cols\": 2},\n",
" \"df2\": {\"n_rows\": 500, \"n_cols\": 2},\n",
" \"df3\": {\"n_rows\": 500, \"n_cols\": 4},\n",
" \"df4\": {\"n_rows\": 1000, \"n_cols\": 4},\n",
" \"df5\": {\"n_rows\": 1000, \"n_cols\": 6},\n",
" \"df6\": {\"n_rows\": 10000, \"n_cols\": 6},\n",
" \"df7\": {\"n_rows\": 10000, \"n_cols\": 10},\n",
" \"df8\": {\"n_rows\": 100000, \"n_cols\": 10},\n",
" \"df9\": {\"n_rows\": 100000, \"n_cols\": 15}\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e542d35c",
"metadata": {},
"outputs": [],
"source": [
"for df_name, params in config.items():\n",
" df = create_df_with_nan(**params)\n",
" df.to_csv(f\"{df_name}.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
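For reference, a copy-safe sketch of the same generator (create_df_with_nan_masked is an illustrative name, not part of the notebook). It blanks the same number of cells through DataFrame.mask instead of writing into df.values.flat, a pattern that is fragile under pandas copy-on-write.

import numpy as np
import pandas as pd

def create_df_with_nan_masked(n_rows: int, n_cols: int) -> pd.DataFrame:
    # Same layout as the notebook's generator: standard-normal columns Colonne_0..Colonne_{n_cols-1}.
    data = {f"Colonne_{i}": np.random.normal(size=n_rows).astype(float) for i in range(n_cols)}
    df = pd.DataFrame(data)
    # Pick exactly 35% of the cells, then blank them in one vectorized call.
    n_missing = int(0.35 * df.size)
    mask = np.zeros(df.shape, dtype=bool)
    mask.flat[np.random.choice(df.size, n_missing, replace=False)] = True
    return df.mask(mask)

# Quick check: overall NaN rate should be close to 0.35.
print(create_df_with_nan_masked(500, 10).isna().mean().mean())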
186 changes: 186 additions & 0 deletions examples/pklm/Optimisation/Paralelisation.ipynb
@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"id": "d1d98837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 9.613310098648071 seconds ---\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"\n",
"_, n_col = df.shape\n",
"\n",
"result = 0\n",
"\n",
"start_time = time.time()\n",
"\n",
"for _ in range (100):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42\n",
" )\n",
" clf.fit(X, y)\n",
" \n",
" result += clf.score(X, y)\n",
" \n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "91583a21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 26.025609016418457 seconds ---\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"\n",
"_, n_col = df.shape\n",
"\n",
"result = 0\n",
"\n",
"start_time = time.time()\n",
"\n",
"for _ in range (100):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42,\n",
" n_jobs=-1\n",
" )\n",
" clf.fit(X, y)\n",
" \n",
" result += clf.score(X, y)\n",
" \n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c651c52b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 1.8062052726745605 seconds ---\n",
"Result: 100.0\n"
]
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from joblib import Parallel, delayed\n",
"\n",
"# Génération de données\n",
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n",
"df = pd.DataFrame(data)\n",
"_, n_col = df.shape\n",
"\n",
"# Fonction pour effectuer une itération du processus\n",
"def process_iteration(i):\n",
" df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n",
" df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n",
" df_classif['target'] = 1 * df_classif['target'].isnull()\n",
" X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n",
" clf = RandomForestClassifier(\n",
" n_estimators=200,\n",
" max_features=None,\n",
" min_samples_split=10,\n",
" bootstrap=True,\n",
" oob_score=True,\n",
" random_state=42\n",
" )\n",
" clf.fit(X, y)\n",
" return clf.score(X, y)\n",
"\n",
"start_time = time.time()\n",
"\n",
"# Utilisation de joblib pour paralléliser les itérations\n",
"results = Parallel(n_jobs=-1)(delayed(process_iteration)(i) for i in range(100))\n",
"\n",
"# Calcul du résultat final\n",
"result = sum(results)\n",
"print(\"--- %s seconds ---\" % (time.time() - start_time))\n",
"print(f\"Result: {result}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
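The three cells above compare a sequential loop (about 9.6 s), n_jobs=-1 inside each forest (about 26 s) and joblib parallelism over the iterations (about 1.8 s). The sketch below is a compact way to rerun that comparison; one_projection and run_benchmark are illustrative names, not part of the notebook, and n_jobs=1 reproduces the sequential cell while n_jobs=-1 reproduces the joblib cell.

import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier


def one_projection(df: pd.DataFrame, n_col: int) -> float:
    # One iteration of the loops above: draw a random column projection,
    # recode its first column as the "target", fit a random forest and score it.
    df_classif = df.sample(np.random.randint(2, n_col), axis=1)
    df_classif = df_classif.rename(columns={df_classif.columns[0]: "target"})
    df_classif["target"] = 1 * df_classif["target"].isnull()
    X, y = df_classif.drop(columns="target"), df_classif["target"]
    clf = RandomForestClassifier(
        n_estimators=200,
        max_features=None,
        min_samples_split=10,
        bootstrap=True,
        oob_score=True,
        random_state=42,
    )
    clf.fit(X, y)
    return clf.score(X, y)


def run_benchmark(df: pd.DataFrame, n_iter: int = 100, n_jobs: int = 1) -> float:
    # n_jobs=1 runs the iterations sequentially; n_jobs=-1 parallelizes them with joblib.
    _, n_col = df.shape
    start_time = time.time()
    scores = Parallel(n_jobs=n_jobs)(
        delayed(one_projection)(df, n_col) for _ in range(n_iter)
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    return sum(scores)


# Usage: same synthetic data as above, parallelized over the 100 iterations.
df = pd.DataFrame({f"Colonne_{i}": np.random.normal(size=200) for i in range(6)})
print("Result:", run_benchmark(df, n_jobs=-1))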